bitutils2/bin_regex/
mod.rs

1//! Provides routines for searching binary data structures (implementing [`BitIndexable`]) for specified patterns
2//!
3//! # Syntax
4//!
5//! ### Basic Matches
6//!<pre class="rust">
7//! _        => Matches single 0 bit
8//! '        => Matches single 1 bit
9//! .        => Matches any single bit
10//! !        => Matches any byte
11//! 0-f      => Matches corresponding nibble (case insensitive)
12//!</pre>
13//!
14//! ### Achors
15//!<pre class="rust">
16//! :        => Matches byte boundary
17//! ;        => Matches nibble boundary
18//!</pre>
19//!
20//! ### Repetitions
21//!<pre class="rust">
22//! *        => Causes RE to match zero or more repetitions of the preceding group (greedy)
23//! +        => Causes RE to match one or more repetitions of the preceding group (greedy)
24//! ?        => Causes RE to match zero or one repetitions of the preceding group (greedy)
25//! {n}      => Causes RE to match exactly n repetitions of the preceding group
26//! {n,}     => Causes RE to match n or more repetitions of the preceding group (greedy)
27//! {n,m}    => Causes RE to match between n and m repetitions of the preceding group (greedy)
28//! *?       => Causes RE to match zero or more repetitions of the preceding group (lazy)
29//! +?       => Causes RE to match one or more repetitions of the preceding group (lazy)
30//! ??       => Causes RE to match zero or one repetitions of the preceding group (lazy)
31//! {n}?     => Same as {n}
32//! {n,}?    => Causes RE to match n or more repetitions of the preceding group (lazy)
33//! {n,m}?   => Causes RE to match between n and m repetitions of the preceding group (lazy)
34//!</pre>
35//!
36//! ### Groups
37//!<pre class="rust">
38//! (exp)    => Numbered capture group
39//! (?<name>exp) => Named (also numbered) capture group
40//! (?:exp)  => Non-capturing group
41//!</pre>
42//!
43//! ### Character Classes
44//!<pre class="rust">
45//! [a8:xyzA-Z3-9]    => 8-bit ASCII
46//! [u8:1,2,3,4..10]  => 8-bit unsigned integer
47//! [i8:1,2,3,4..10]  => 8-bit two's compliment integer
48//! [o8:1,2,3,4..10]  => 8-bit one's compliment integer
49//! [s8:1,2,3,4..10]  => 8-bit sign-magnitude integer
50//! [n8:1,2,3,4..10]  => 8-bit negabinary integer
51//! [f16:1,2,3,4..10]  => 16-bit floating point (IEEE-754 Half)
52//! [f32:1,2,3,4..10]  => 32-bit floating point (IEEE-754 Single)
53//! [f64:1,2,3,4..10]  => 64-bit floating point (IEEE-754 Double)
54//! [f128:1,2,3,4..10]  => 128-bit floating point (IEEE-754 Quadruple)
55//!</pre>
56//!
57//! Matches any set of `n` bits that is listed in the class expression when interpreted according to the character class flag.
58//!
59//! `a`: Values listed in the class expression are interpreted as ASCII
60//! `u`: Values listed in the class expression are interpreted as unsigned integers
61//!
62//! # Compositions
63//!<pre class="rust">
64//! x|y      => Matches x or y
65//! xy       => Matches x followed by y
66//!</pre>
67
68use std::collections::HashSet;
69
70mod parse_helpers;
71use crate::bin_regex::parse_helpers::{escape_vec, find_right_delimiter};
72
73use crate::bx;
74use crate::bit_index::{BitIndex, BitIndexable};
75use crate::bit_field::{IntFormat, BitField, FromBitField};
76
77#[derive(Debug, Clone, PartialEq)]
78enum Greediness {
79    Greedy,
80    Lazy
81}
82
83#[derive(Debug, Clone, PartialEq)]
84enum Repeat {
85    Exactly(usize),
86    LessThan(usize, Greediness),
87    Any(Greediness)
88}
89
90#[derive(Debug, Clone, PartialEq)]
91enum GroupType {
92    NonCapturing,
93    Numbered,
94    Named(String)
95}
96
97
98#[derive(Debug, PartialEq)]
99enum DynamicCharacterClass {
100    UInt(CharClass<UInt>),
101    TwosCompInt(CharClass<TwosCompInt>),
102    OnesCompInt(CharClass<OnesCompInt>),
103    SignMagInt(CharClass<SignMagInt>),
104    NegaInt(CharClass<NegaInt>),
105    Float16(CharClass<SembFloat<1, 5, 10, 15>>),
106    Float32(CharClass<SembFloat<1, 8, 23, 127>>),
107    Float64(CharClass<SembFloat<1, 11, 52, 1023>>),
108    Float128(CharClass<SembFloat<1, 15, 112, 16383>>)
109}
110
111impl DynamicCharacterClass {
112    fn input_length(&self) -> BitIndex {
113        match self {
114            DynamicCharacterClass::UInt(cls) => cls.input_length(),
115            DynamicCharacterClass::TwosCompInt(cls) => cls.input_length(),
116            DynamicCharacterClass::OnesCompInt(cls) => cls.input_length(),
117            DynamicCharacterClass::SignMagInt(cls) => cls.input_length(),
118            DynamicCharacterClass::NegaInt(cls) => cls.input_length(),
119            DynamicCharacterClass::Float16(cls) => cls.input_length(),
120            DynamicCharacterClass::Float32(cls) => cls.input_length(),
121            DynamicCharacterClass::Float64(cls) => cls.input_length(),
122            DynamicCharacterClass::Float128(cls) => cls.input_length(),
123        }
124    }
125
126    fn matches(&self, input: &BitField) -> bool {
127        match self {
128            DynamicCharacterClass::UInt(cls) => cls.matches(input),
129            DynamicCharacterClass::TwosCompInt(cls) => cls.matches(input),
130            DynamicCharacterClass::OnesCompInt(cls) => cls.matches(input),
131            DynamicCharacterClass::SignMagInt(cls) => cls.matches(input),
132            DynamicCharacterClass::NegaInt(cls) => cls.matches(input),
133            DynamicCharacterClass::Float16(cls) => cls.matches(input),
134            DynamicCharacterClass::Float32(cls) => cls.matches(input),
135            DynamicCharacterClass::Float64(cls) => cls.matches(input),
136            DynamicCharacterClass::Float128(cls) => cls.matches(input)
137        }
138    }
139}
140
141#[derive(Debug, Clone, PartialEq)]
142enum Token {
143    Repetition(Repeat, Box<Token>),
144    Group(Option<usize>, Vec<Token>),
145    Choice(Vec<Token>),
146    CharacterClass(std::rc::Rc<DynamicCharacterClass>),
147    Nibble(u8),
148    ByteBoundary,
149    NibbleBoundary,
150    BitZero,
151    BitOne,
152    BitAny,
153    ByteAny,
154}
155
156#[derive(Debug, Clone)]
157enum StateFlag {
158    GroupStart(usize),
159    GroupEnd(usize),
160    NotOffsetAnd(usize) // Proceeds if (offset & value) == 0
161}
162
163#[derive(Debug, Clone)]
164enum StateTransition {
165    Free,
166    EqualsBit(u8),
167    EqualsNibble(u8),
168    ConsumeBits(usize),
169    CharacterClass(std::rc::Rc<DynamicCharacterClass>)
170}
171
172// enum TextEncoding {
173//     Ascii,
174//     Utf8,
175//     Utf16,
176//     Hex
177// }
178
179// trait CharClass: std::fmt::Debug {
180//     fn input_length(&self) -> BitIndex;
181//     fn matches(&self, input: &BitField) -> bool;
182// }
183
184// enum CharClassType {
185//     Ascii,
186//     UInt(usize),
187//     IInt(usize)
188// }
189
190// #[derive(PartialEq, Eq, Debug)]
191// struct U8CharClass {
192//     inverted: bool,
193//     options: HashSet<UInt>,
194//     ranges: Vec<(u8, u8)>
195// }
196
197// impl CharClass for U8CharClass {
198//     fn input_length(&self) -> BitIndex {BitIndex::new(1, 0)}
199//     fn matches(&self, input: &BitField) -> bool {
200//         let x = input.extract_u8(BitIndex::new(0, 0));
201//         if self.options.contains(&x) {
202//             return !self.inverted
203//         }
204//         for (start, end) in &self.ranges {
205//             if *start <= x && x <= *end {
206//                 return !self.inverted
207//             }
208//         }
209//         return self.inverted
210//     }
211// }
212
213// #[derive(std::hash::Hash, Debug)]
214// struct UInt {
215//     inner: u128
216// }
217
218// impl std::str::FromStr for UInt {
219//     type Err = std::num::ParseIntError;
220
221//     fn from_str(s: &str) -> Result<Self, Self::Err> {
222//         Ok(UInt{inner: u128::from_str(s)?})
223//     }
224// }
225
226#[derive(PartialEq, Eq, Ord, std::hash::Hash, Clone, Debug)]
227struct UInt(u128);
228
229impl FromBitField for UInt {
230    fn from_bf_be(bf: &BitField) -> UInt {
231        let mut bf = bf.clone();
232        let n = bf.len().total_bits();
233        if n > 128 {
234            bf.truncate_be(BitIndex::bits(128));
235        } else if n < 128 {
236            bf.pad_unsigned_be(BitIndex::bits(128));
237        }
238
239        UInt(u128::from_be_bytes(bf.into_array().unwrap()))
240    }
241
242    fn from_bf_le(bf: &BitField) -> UInt {
243        let mut bf = bf.clone();
244        let n = bf.len().total_bits();
245        if n > 128 {
246            bf.truncate_le(BitIndex::bits(128));
247        } else if n < 128 {
248            bf.pad_unsigned_le(BitIndex::bits(128));
249        }
250
251        UInt(u128::from_le_bytes(bf.into_array().unwrap()))
252    }
253}
254
255impl std::str::FromStr for UInt {
256    type Err = std::num::ParseIntError;
257
258    fn from_str(s: &str) -> Result<Self, Self::Err> {
259        Ok(UInt(u128::from_str(s)?))
260    }
261}
262
263impl std::cmp::PartialOrd for UInt {
264    fn partial_cmp(&self, other: &UInt) -> Option<std::cmp::Ordering> {
265        self.0.partial_cmp(&other.0)
266    }
267}
268
269#[derive(PartialEq, Eq, Ord, std::hash::Hash, Clone, Debug)]
270struct TwosCompInt(i128);
271
272impl FromBitField for TwosCompInt {
273    fn from_bf_be(bf: &BitField) -> TwosCompInt {
274        let mut bf = bf.clone();
275        let n = bf.len().total_bits();
276        if n > 128 {
277            bf.truncate_be(BitIndex::bits(128));
278        } else if n < 128 {
279            bf.pad_twos_compliment_be(BitIndex::bits(128));
280        }
281
282        TwosCompInt(i128::from_be_bytes(bf.into_array().unwrap()))
283    }
284
285    fn from_bf_le(bf: &BitField) -> TwosCompInt {
286        let mut bf = bf.clone();
287        let n = bf.len().total_bits();
288        if n > 128 {
289            bf.truncate_le(BitIndex::bits(128));
290        } else if n < 128 {
291            bf.pad_twos_compliment_le(BitIndex::bits(128));
292        }
293
294        TwosCompInt(i128::from_le_bytes(bf.into_array().unwrap()))
295    }
296}
297
298impl std::str::FromStr for TwosCompInt {
299    type Err = std::num::ParseIntError;
300
301    fn from_str(s: &str) -> Result<Self, Self::Err> {
302        Ok(TwosCompInt(i128::from_str(s)?))
303    }
304}
305
306impl std::cmp::PartialOrd for TwosCompInt {
307    fn partial_cmp(&self, other: &TwosCompInt) -> Option<std::cmp::Ordering> {
308        self.0.partial_cmp(&other.0)
309    }
310}
311
312#[derive(PartialEq, Eq, Ord, std::hash::Hash, Clone, Debug)]
313struct OnesCompInt(i128);
314
315impl FromBitField for OnesCompInt {
316    fn from_bf_be(bf: &BitField) -> OnesCompInt {
317        let mut bf = bf.clone();
318        let n = bf.len().total_bits();
319        if n > 128 {
320            bf.truncate_be(BitIndex::bits(128));
321        } else if n < 128 {
322            // Two's compliment padding and one's compliment padding are the same
323            bf.pad_twos_compliment_be(BitIndex::bits(128));
324        }
325
326        let neg = bf.convert_unsigned_be(IntFormat::OnesCompliment);
327        let mut i = u128::from_be_bytes(bf.into_array().unwrap()) as i128;
328        if neg {
329            i *= -1
330        }
331        OnesCompInt(i)
332    }
333
334    fn from_bf_le(bf: &BitField) -> OnesCompInt {
335        let mut bf = bf.clone();
336        let n = bf.len().total_bits();
337        if n > 128 {
338            bf.truncate_le(BitIndex::bits(128));
339        } else if n < 128 {
340            // Two's compliment padding and one's compliment padding are the same
341            bf.pad_twos_compliment_le(BitIndex::bits(128));
342        }
343
344        let neg = bf.convert_unsigned_le(IntFormat::OnesCompliment);
345        let mut i = u128::from_le_bytes(bf.into_array().unwrap()) as i128;
346        if neg {
347            i *= -1
348        }
349        OnesCompInt(i)
350    }
351}
352
353impl std::str::FromStr for OnesCompInt {
354    type Err = std::num::ParseIntError;
355
356    fn from_str(s: &str) -> Result<Self, Self::Err> {
357        Ok(OnesCompInt(i128::from_str(s)?))
358    }
359}
360
361impl std::cmp::PartialOrd for OnesCompInt {
362    fn partial_cmp(&self, other: &OnesCompInt) -> Option<std::cmp::Ordering> {
363        self.0.partial_cmp(&other.0)
364    }
365}
366
367#[derive(PartialEq, Eq, Ord, std::hash::Hash, Clone, Debug)]
368struct SignMagInt(i128);
369
370impl FromBitField for SignMagInt {
371    fn from_bf_be(bf: &BitField) -> SignMagInt {
372        let mut bf = bf.clone();
373        let n = bf.len().total_bits();
374        if n > 128 {
375            bf.truncate_be(BitIndex::bits(128));
376        } else if n < 128 {
377            bf.pad_sign_magnitude_be(BitIndex::bits(128));
378        }
379
380        let neg = bf.convert_unsigned_be(IntFormat::SignMagnitude);
381        let mut i = u128::from_be_bytes(bf.into_array().unwrap()) as i128;
382        if neg {
383            i *= -1
384        }
385        SignMagInt(i)
386    }
387
388    fn from_bf_le(bf: &BitField) -> SignMagInt {
389        let mut bf = bf.clone();
390        let n = bf.len().total_bits();
391        if n > 128 {
392            bf.truncate_le(BitIndex::bits(128));
393        } else if n < 128 {
394            bf.pad_sign_magnitude_le(BitIndex::bits(128));
395        }
396
397        let neg = bf.convert_unsigned_le(IntFormat::SignMagnitude);
398        let mut i = u128::from_le_bytes(bf.into_array().unwrap()) as i128;
399        if neg {
400            i *= -1
401        }
402        SignMagInt(i)
403    }
404}
405
406impl std::str::FromStr for SignMagInt {
407    type Err = std::num::ParseIntError;
408
409    fn from_str(s: &str) -> Result<Self, Self::Err> {
410        Ok(SignMagInt(i128::from_str(s)?))
411    }
412}
413
414impl std::cmp::PartialOrd for SignMagInt {
415    fn partial_cmp(&self, other: &SignMagInt) -> Option<std::cmp::Ordering> {
416        self.0.partial_cmp(&other.0)
417    }
418}
419
420#[derive(PartialEq, Eq, Ord, std::hash::Hash, Clone, Debug)]
421struct NegaInt(i128);
422
423impl FromBitField for NegaInt {
424    fn from_bf_be(bf: &BitField) -> NegaInt {
425        let mut bf = bf.clone();
426        let n = bf.len().total_bits();
427        if n > 128 {
428            bf.truncate_be(BitIndex::bits(128));
429        } else if n < 128 {
430            // Unsigned padding and negabinary padding are the same
431            bf.pad_unsigned_be(BitIndex::bits(128));
432        }
433
434        let neg = bf.convert_unsigned_be(IntFormat::BaseMinusTwo);
435        let mut i = u128::from_be_bytes(bf.into_array().unwrap()) as i128;
436        if neg {
437            i *= -1
438        }
439        NegaInt(i)
440    }
441
442    fn from_bf_le(bf: &BitField) -> NegaInt {
443        let mut bf = bf.clone();
444        let n = bf.len().total_bits();
445        if n > 128 {
446            bf.truncate_le(BitIndex::bits(128));
447        } else if n < 128 {
448            // Unsigned padding and negabinary padding are the same
449            bf.pad_unsigned_le(BitIndex::bits(128));
450        }
451
452        let neg = bf.convert_unsigned_le(IntFormat::BaseMinusTwo);
453        let mut i = u128::from_le_bytes(bf.into_array().unwrap()) as i128;
454        if neg {
455            i *= -1
456        }
457        NegaInt(i)
458    }
459}
460
461impl std::str::FromStr for NegaInt {
462    type Err = std::num::ParseIntError;
463
464    fn from_str(s: &str) -> Result<Self, Self::Err> {
465        Ok(NegaInt(i128::from_str(s)?))
466    }
467}
468
469impl std::cmp::PartialOrd for NegaInt {
470    fn partial_cmp(&self, other: &NegaInt) -> Option<std::cmp::Ordering> {
471        self.0.partial_cmp(&other.0)
472    }
473}
474
475#[derive(Clone, Debug)]
476struct Float32(f32);
477impl Float32 {
478    fn canonical_form(&self) -> u32 {
479        let mut u = self.0.clone().to_bits();
480
481        // If NaN...
482        if u & 0x7F800000 == 0x7F800000 {
483            // for NaN, mask all but the most significant mantissa 
484            // bit (the quiet/signaling bit) since those bits don't
485            // matter
486            u &= 0xFFC00000;
487            // Set the least significant mantissa bit to 1 to differentiate
488            // it from infinity
489            u |= 0x00000001;
490            
491        }
492        u
493    }
494}
495impl FromBitField for Float32 {
496    fn from_bf_be(bf: &BitField) -> Float32 {
497        let mut bf = bf.clone();
498        let n = bf.len().total_bits();
499        if n > 32 {
500            bf.truncate_be(BitIndex::bits(32));
501        } else if n < 32 {
502            panic!("Float 32 cannot be formed from a BitField with length {}", bf.len())
503        }
504
505        Float32(f32::from_be_bytes(bf.into_array().unwrap()))
506    }
507
508    fn from_bf_le(bf: &BitField) -> Float32 {
509        let mut bf = bf.clone();
510        let n = bf.len().total_bits();
511        if n > 32 {
512            bf.truncate_le(BitIndex::bits(32));
513        } else if n < 32 {
514            panic!("Float 32 cannot be formed from a BitField with length {}", bf.len())
515        }
516
517        Float32(f32::from_le_bytes(bf.into_array().unwrap()))
518    }
519}
520
521
522impl std::hash::Hash for Float32 {
523    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
524        self.canonical_form().hash(state);
525    }
526}
527
528impl PartialEq for Float32 {
529    fn eq(&self, other: &Float32) -> bool {
530        self.canonical_form() == other.canonical_form()
531    }
532}
533
534impl Eq for Float32 {}
535
536impl std::str::FromStr for Float32 {
537    type Err = std::num::ParseFloatError;
538
539    fn from_str(s: &str) -> Result<Self, Self::Err> {
540        match s.to_lowercase().as_str() {
541            "snan" | "+snan" => {
542                // Canonical form for positive signaling NAN
543                Ok(Float32(f32::from_bits(0x7F800001)))
544            },
545            "-snan" => {
546                // Canonical form for negative signaling NAN
547                Ok(Float32(f32::from_bits(0xFF800001)))
548            },
549            "qnan" | "+qnan" => {
550                // Canonical form for positive quiet NAN
551                Ok(Float32(f32::from_bits(0x7FC00001)))
552            },
553            "-qnan" => {
554                // Canonical form for negative quiet NAN
555                Ok(Float32(f32::from_bits(0xFFC00001)))
556            },
557            _ => Ok(Float32(f32::from_str(s)?))
558        }
559        
560    }
561}
562
563impl std::cmp::PartialOrd for Float32 {
564    fn partial_cmp(&self, other: &Float32) -> Option<std::cmp::Ordering> {
565        self.0.partial_cmp(&other.0)
566    }
567}
568
569use crate::Semb;
570#[derive(Clone, Debug)]
571struct SembFloat<const S: usize, const E: usize, const M: usize, const B: i32>(Semb<S, E, M, B>);
572
573impl<const S: usize, const E: usize, const M: usize, const B: i32> FromBitField for SembFloat<S, E, M, B> {
574    fn from_bf_be(bf: &BitField) -> SembFloat<S, E, M, B> {
575        SembFloat(Semb::from_bf_be(bf))
576    }
577
578    fn from_bf_le(bf: &BitField) -> SembFloat<S, E, M, B> {
579        SembFloat(Semb::from_bf_le(bf))
580    }
581}
582
583impl<const S: usize, const E: usize, const M: usize, const B: i32> std::hash::Hash for SembFloat<S, E, M, B> {
584    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
585        self.0.canonical_form().hash(state);
586    }
587}
588
589impl<const S: usize, const E: usize, const M: usize, const B: i32> PartialEq for SembFloat<S, E, M, B> {
590    fn eq(&self, other: &SembFloat<S, E, M, B>) -> bool {
591        self.0.canonical_form() == other.0.canonical_form()
592    }
593}
594
595impl<const S: usize, const E: usize, const M: usize, const B: i32> std::cmp::Eq for SembFloat<S, E, M, B> {}
596
597
598impl<const S: usize, const E: usize, const M: usize, const B: i32> std::str::FromStr for SembFloat<S, E, M, B> {
599    type Err = crate::semb::ParseSembError;
600
601    fn from_str(s: &str) -> Result<Self, Self::Err> {
602        Ok(SembFloat(Semb::from_str(s)?))
603        
604    }
605}
606
607impl<const S: usize, const E: usize, const M: usize, const B: i32> std::cmp::PartialOrd for SembFloat<S, E, M, B> {
608    fn partial_cmp(&self, other: &SembFloat<S, E, M, B>) -> Option<std::cmp::Ordering> {
609        self.0.partial_cmp(&other.0)
610    }
611}
612
613#[derive(PartialEq, Eq, Debug)]
614enum Endian {
615    Big,
616    Little
617}
618
619#[derive(PartialEq, Debug)]
620struct CharClass<T: std::str::FromStr + PartialOrd + std::hash::Hash + FromBitField + std::cmp::Eq> {
621    inverted: bool,
622    endian: Endian,
623    options: HashSet<T>,
624    ranges: Vec<(T, T)>,
625    input_length: BitIndex
626}
627
628impl<T: std::str::FromStr + PartialOrd + std::hash::Hash + FromBitField + std::cmp::Eq + std::fmt::Debug> CharClass<T> {
629    fn input_length(&self) -> BitIndex {
630        self.input_length
631    }
632
633    fn matches(&self, input: &BitField) -> bool {
634        let x: T = match self.endian {
635            Endian::Big => T::from_bf_be(input),
636            Endian::Little => T::from_bf_le(input)
637        };
638        //println!("TRYING TO MATCH {:?}", x);
639        if self.options.contains(&x) {
640            return !self.inverted
641        }
642        for (start, end) in &self.ranges {
643            if *start <= x && x <= *end {
644                return !self.inverted
645            }
646        }
647        return self.inverted
648    }
649}
650
651fn parse_ascii_char_class(input: &Vec<char>, endian: Endian) -> Result<CharClass<UInt>, String> {
652    let mut input_iter = escape_vec(input).peekable();
653    let mut options = HashSet::<UInt>::new();
654    let mut ranges = Vec::<(UInt, UInt)>::new();
655    let mut prev_char: Option<UInt> = None;
656    // let mut escaped = false;
657    let mut range_started = false;
658
659    let inverted = if input_iter.peek() == Some(&('^', false)) {
660        input_iter.next();
661        true
662    } else {
663        false
664    };
665
666    for (c, escaped) in input_iter {
667        match c {
668            '-' if !escaped => {
669                match prev_char {
670                    Some(_) if range_started => return Err("Unexpected '-' encountered after '-'".to_string()),
671                    Some(_) => {
672                        range_started = true;
673                        continue
674                    },
675                    None => return Err("Unexpected '-' encountered at beginning of character class".to_string()),
676                }
677            },
678            _ if range_started => {
679                match prev_char {
680                    Some(ref p) => {
681                        ranges.push((p.clone(), UInt(c as u128)));
682                        prev_char = None;
683                    },
684                    None => return Err("Impossible state".to_string())
685                }
686            },
687            _ => {
688                match prev_char {
689                    Some(ref p) => {
690                        options.insert(p.clone());
691                    },
692                    None => ()
693                }
694                prev_char = Some(UInt(c as u128));
695            }
696        }
697        range_started = false;
698
699    }
700    match prev_char {
701        Some(ref p) => {
702            options.insert(p.clone());
703        },
704        None => ()
705    }
706    Ok(CharClass::<UInt> {inverted, endian, options, ranges, input_length: BitIndex::new(1, 0)})
707}
708
709enum RangeParseProgress<T: std::str::FromStr + PartialOrd + std::hash::Hash + FromBitField + std::fmt::Debug + std::cmp::Eq> {
710    NotStarted,
711    OneDot(T),
712    TwoDot(T)
713}
714
715fn parse_int_char_class<T: std::str::FromStr + PartialOrd + std::hash::Hash + FromBitField + std::fmt::Debug + Eq>(input: &Vec<char>, nbits: usize, endian: Endian) -> Result<CharClass<T>, String>
716where <T as std::str::FromStr>::Err: std::fmt::Display {
717    let input_length = bx!(,nbits);
718    let mut input_iter = escape_vec(input).peekable();
719    let mut options = HashSet::<T>::new();
720    let mut ranges = Vec::<(T, T)>::new();
721    let mut prev_word = Vec::<char>::new();
722    let mut range_prog = RangeParseProgress::<T>::NotStarted;
723
724    let inverted = if input_iter.peek() == Some(&('^', false)) {
725        input_iter.next();
726        true
727    } else {
728        false
729    };
730
731    for (c, escaped) in input_iter {
732        if escaped {
733            return Err("Escaped characters invalid in decimal character class".to_string());
734        }
735        match c {
736            ',' => {
737                match prev_word.iter().collect::<String>().parse::<T>() {
738                    Ok(n) => {
739                        match range_prog {
740                            RangeParseProgress::NotStarted => {
741                                options.insert(n);
742                            },
743                            RangeParseProgress::TwoDot(n0) => {
744                                ranges.push((n0, n));
745                                range_prog = RangeParseProgress::NotStarted;
746                            },
747                            _ => return Err(format!("Encountered unexpected character '{}' while parsing range", c))
748                        }
749                    },
750                    Err(msg) => return Err(msg.to_string())
751                }
752                prev_word.clear();
753            },
754            '.' => {
755                match range_prog {
756                    RangeParseProgress::NotStarted => {
757                        match prev_word.iter().collect::<String>().parse::<T>() {
758                            Ok(n) => {
759                                range_prog = RangeParseProgress::OneDot(n);
760                                prev_word.clear();
761                            },
762                            Err(msg) => return Err(msg.to_string())
763                        }
764                    },
765                    RangeParseProgress::OneDot(n0) => {
766                        if !prev_word.is_empty() {
767                            return Err("Impossible State".to_string())
768                        }
769                        range_prog = RangeParseProgress::TwoDot(n0);
770                    },
771                    RangeParseProgress::TwoDot(_) => {
772                        return Err(format!("Encountered unexpected character '{}' while parsing range", c))
773                    }
774                }
775            },
776            '-' | '+' | '0'..='9' => {
777                match range_prog {
778                    RangeParseProgress::OneDot(_) => {
779                        return Err(format!("Encountered unexpected character '{}' while parsing range", c))
780                    },
781                    _ => ()
782                }
783                prev_word.push(c);
784            },
785            _ => return Err(format!("Encountered unexpected character '{}' while parsing decimal character class", c))
786        }
787
788    }
789
790    if prev_word.is_empty() {
791        return Err("Encountered end of character class before expected".to_string())
792    }
793    match prev_word.iter().collect::<String>().parse::<T>() {
794        Ok(n) => {
795            match range_prog {
796                RangeParseProgress::NotStarted => {
797                    options.insert(n);
798                },
799                RangeParseProgress::TwoDot(n0) => {
800                    ranges.push((n0, n));
801                },
802                _ => return Err("Encountered end of character class before expected".to_string())
803            }
804        },
805        Err(msg) => return Err(msg.to_string())
806    }
807
808    Ok(CharClass::<T> {inverted, endian, options, ranges, input_length})
809}
810
811fn parse_num_char_class<T: std::str::FromStr + PartialOrd + std::hash::Hash + FromBitField + std::fmt::Debug + Eq>(input: &Vec<char>, nbits: usize, endian: Endian) -> Result<CharClass<T>, String>
812where <T as std::str::FromStr>::Err: std::fmt::Display {
813    let input_length = bx!(,nbits);
814    let mut input_iter = input.into_iter().peekable();
815    let mut options = HashSet::<T>::new();
816    let mut ranges = Vec::<(T, T)>::new();
817
818    let inverted = if input_iter.peek() == Some(&&'^') {
819        input_iter.next();
820        true
821    } else {
822        false
823    };
824
825    let body: String = input_iter.collect();
826
827    for word in body.split(",") {
828        match word.split_once("..") {
829            Some((lhs, rhs)) => {
830                let lhs = match lhs.parse::<T>() {
831                    Ok(n) => n,
832                    Err(err) => return Err(err.to_string())
833                };
834                let rhs = match rhs.parse::<T>() {
835                    Ok(n) => n,
836                    Err(err) => return Err(err.to_string())
837                };
838                ranges.push((lhs, rhs));
839            },
840            None => {
841                let value = match word.parse::<T>() {
842                    Ok(n) => n,
843                    Err(err) => return Err(err.to_string())
844                };
845                options.insert(value);
846            }
847        }
848    }
849
850    Ok(CharClass::<T> {inverted, endian, options, ranges, input_length})
851}
852
853fn parse_char_class(input: &Vec<char>) -> Result<Token, String> {
854    let mut from_start = Vec::<char>::new();
855    for (i, c) in input.iter().enumerate() {
856        match c {
857            '<' | '>' | '0'..='9'|'a'..='z'|'A'..='Z' => {
858                from_start.push(*c);
859            },
860            ':' => {
861                let mut char_class_type_index = 0;
862                let endian = match from_start[0] {
863                    '<' => {
864                        char_class_type_index += 1;
865                        Endian::Little
866                    },
867                    '>' => {
868                        char_class_type_index += 1;
869                        Endian::Big
870                    },
871                    _ => Endian::Big
872                };
873
874                match from_start[(char_class_type_index + 1)..].iter().collect::<String>().parse::<usize>() {
875                    Ok(n) => {
876                        match from_start[char_class_type_index] {
877                            'a' => {
878                                let cls_result = parse_ascii_char_class(&input[i+1..].to_vec(), endian);
879                                match cls_result {
880                                    Ok(cls) => {
881                                        return Ok(Token::CharacterClass(std::rc::Rc::new(DynamicCharacterClass::UInt(cls))))
882                                    },
883                                    Err(msg) => return Err(msg)
884                                }
885                            },
886                            'u' => {
887                                let cls_result = parse_num_char_class::<UInt>(&input[i+1..].to_vec(), n, endian);
888                                match cls_result {
889                                    Ok(cls) => {
890                                        return Ok(Token::CharacterClass(std::rc::Rc::new(DynamicCharacterClass::UInt(cls))))
891                                    },
892                                    Err(msg) => return Err(msg)
893                                }
894                            },
895                            'i' => {
896                                let cls_result = parse_num_char_class::<TwosCompInt>(&input[i+1..].to_vec(), n, endian);
897                                match cls_result {
898                                    Ok(cls) => {
899                                        return Ok(Token::CharacterClass(std::rc::Rc::new(DynamicCharacterClass::TwosCompInt(cls))))
900                                    },
901                                    Err(msg) => return Err(msg)
902                                }
903                            },
904                            'o' => {
905                                let cls_result = parse_num_char_class::<OnesCompInt>(&input[i+1..].to_vec(), n, endian);
906                                match cls_result {
907                                    Ok(cls) => {
908                                        return Ok(Token::CharacterClass(std::rc::Rc::new(DynamicCharacterClass::OnesCompInt(cls))))
909                                    },
910                                    Err(msg) => return Err(msg)
911                                }
912                            },
913                            's' => {
914                                let cls_result = parse_num_char_class::<SignMagInt>(&input[i+1..].to_vec(), n, endian);
915                                match cls_result {
916                                    Ok(cls) => {
917                                        return Ok(Token::CharacterClass(std::rc::Rc::new(DynamicCharacterClass::SignMagInt(cls))))
918                                    },
919                                    Err(msg) => return Err(msg)
920                                }
921                            },
922                            'n' => {
923                                let cls_result = parse_num_char_class::<NegaInt>(&input[i+1..].to_vec(), n, endian);
924                                match cls_result {
925                                    Ok(cls) => {
926                                        return Ok(Token::CharacterClass(std::rc::Rc::new(DynamicCharacterClass::NegaInt(cls))))
927                                    },
928                                    Err(msg) => return Err(msg)
929                                }
930                            },
931                            'f' => {
932                                match n {
933                                    16 => {
934                                        let cls_result = parse_num_char_class::<SembFloat<1, 5, 10, 15>>(&input[i+1..].to_vec(), n, endian);
935                                        match cls_result {
936                                            Ok(cls) => {
937                                                return Ok(Token::CharacterClass(std::rc::Rc::new(DynamicCharacterClass::Float16(cls))))
938                                            },
939                                            Err(msg) => return Err(msg)
940                                        }
941                                    },
942                                    32 => {
943                                        let cls_result = parse_num_char_class::<SembFloat<1, 8, 23, 127>>(&input[i+1..].to_vec(), n, endian);
944                                        match cls_result {
945                                            Ok(cls) => {
946                                                return Ok(Token::CharacterClass(std::rc::Rc::new(DynamicCharacterClass::Float32(cls))))
947                                            },
948                                            Err(msg) => return Err(msg)
949                                        }
950                                    },
951                                    64 => {
952                                        let cls_result = parse_num_char_class::<SembFloat<1, 11, 52, 1023>>(&input[i+1..].to_vec(), n, endian);
953                                        match cls_result {
954                                            Ok(cls) => {
955                                                return Ok(Token::CharacterClass(std::rc::Rc::new(DynamicCharacterClass::Float64(cls))))
956                                            },
957                                            Err(msg) => return Err(msg)
958                                        }
959                                    },
960                                    128 => {
961                                        let cls_result = parse_num_char_class::<SembFloat<1, 15, 112, 16383>>(&input[i+1..].to_vec(), n, endian);
962                                        match cls_result {
963                                            Ok(cls) => {
964                                                return Ok(Token::CharacterClass(std::rc::Rc::new(DynamicCharacterClass::Float128(cls))))
965                                            },
966                                            Err(msg) => return Err(msg)
967                                        }
968                                    },
969                                    _ => {
970                                        return Err(format!("{}-bit float format not recognized", n))
971                                    }
972                                }
973                                
974                            },
975                            _ => return Err(format!("Character class type '{}' not recognized", from_start.iter().collect::<String>()))
976                        }
977                    },
978                    Err(msg) => return Err(msg.to_string())
979                }
980            },
981            _ => break
982        }
983    }
984    match parse_ascii_char_class(input, Endian::Big) {
985        Ok(cls) => {
986            Ok(Token::CharacterClass(std::rc::Rc::new(DynamicCharacterClass::UInt(cls))))
987        },
988        Err(msg) => Err(msg)
989    }
990}
991
992fn compile(input: &Vec<Token>, start_index: usize) -> Vec<Vec<(usize, StateTransition, Vec<StateFlag>)>> {
993    let mut result = Vec::<Vec<(usize, StateTransition, Vec<StateFlag>)>>::new();
994    let mut i = start_index;
995    for token in input {
996        match token {
997            Token::Repetition(Repeat::Any(greediness), inside) => {
998                let inside_result = compile(&vec![*inside.clone()], i + 1);
999                let inside_len = inside_result.len();
1000                match greediness {
1001                    Greediness::Greedy => {
1002                        result.push(vec![(i + 1, StateTransition::Free, vec![]), (i + inside_len + 2, StateTransition::Free, vec![])]);
1003                        result.extend(inside_result);
1004                        result.push(vec![(i + 1, StateTransition::Free, vec![]), (i + inside_len + 2, StateTransition::Free, vec![])]);
1005                    },
1006                    Greediness::Lazy => {
1007                        result.push(vec![(i + inside_len + 2, StateTransition::Free, vec![]), (i + 1, StateTransition::Free, vec![])]);
1008                        result.extend(inside_result);
1009                        result.push(vec![(i + inside_len + 2, StateTransition::Free, vec![]), (i + 1, StateTransition::Free, vec![])]);
1010                    }
1011                }
1012
1013                i += inside_len + 2;
1014            },
1015            Token::Repetition(Repeat::Exactly(n), inside) => {
1016                for _ in 0..*n {
1017                    let inside_result = compile(&vec![*inside.clone()], i);
1018                    i += inside_result.len();
1019                    result.extend(inside_result);
1020                }
1021            },
1022            Token::Repetition(Repeat::LessThan(n, greediness), inside) => {
1023                let start_i = i - start_index;
1024                result.push(Vec::<(usize, StateTransition, Vec<StateFlag>)>::new());
1025                i += 1;
1026                for _ in 0..(*n - 1) {
1027                    result[start_i].push((i, StateTransition::Free, vec![]));
1028                    let inside_result = compile(&vec![*inside.clone()], i);
1029                    i += inside_result.len();
1030                    result.extend(inside_result);
1031                }
1032                result[start_i].push((i, StateTransition::Free, vec![]));
1033                if matches!(greediness, Greediness::Lazy) {
1034                    result[start_i].reverse();
1035                }
1036            },
1037            Token::Choice(inside) => {
1038                let mut finish_indices = Vec::<usize>::new();
1039                let start_i = i - start_index;
1040                result.push(Vec::<(usize, StateTransition, Vec<StateFlag>)>::new());
1041                i += 1;
1042                for tok in inside {
1043                    result[start_i].push((i, StateTransition::Free, vec![]));
1044                    let inside_result = compile(&vec![tok.clone()], i);
1045                    i += inside_result.len();
1046                    result.extend(inside_result);
1047                    result.push(Vec::<(usize, StateTransition, Vec<StateFlag>)>::new());
1048                    finish_indices.push(i - start_index);
1049                    i += 1;
1050                }
1051                for finish_i in finish_indices {
1052                    result[finish_i].push((i, StateTransition::Free, vec![]));
1053                }
1054
1055            },
1056            Token::Group(Some(group_index), inside) => {
1057                let inside_result = compile(inside, i + 1);
1058                let inside_len = inside_result.len();
1059                result.push(vec![(i + 1, StateTransition::Free, vec![StateFlag::GroupStart(*group_index)])]);
1060                result.extend(inside_result);
1061                result.push(vec![(i + inside_len + 2, StateTransition::Free, vec![StateFlag::GroupEnd(*group_index)])]);
1062                i += inside_len + 2;
1063            },
1064            Token::Group(None, inside) => {
1065                let inside_result = compile(inside, i);
1066                let inside_len = inside_result.len();
1067                result.extend(inside_result);
1068                i += inside_len;
1069            },
1070            Token::CharacterClass(cls) => {
1071                result.push(vec![(i + 1, StateTransition::CharacterClass(cls.clone()), vec![])]);
1072                i += 1;
1073            }
1074            Token::BitZero => {
1075                result.push(vec![(i + 1, StateTransition::EqualsBit(0), vec![])]);
1076                i += 1;
1077            },
1078            Token::BitOne => {
1079                result.push(vec![(i + 1, StateTransition::EqualsBit(1), vec![])]);
1080                i += 1;
1081            },
1082            Token::BitAny => {
1083                result.push(vec![(i + 1, StateTransition::ConsumeBits(1), vec![])]);
1084                i += 1;
1085            },
1086            Token::Nibble(n) => {
1087                result.push(vec![(i + 1, StateTransition::EqualsNibble(*n), vec![])]);
1088                i += 1;
1089            }
1090            Token::ByteAny => {
1091                result.push(vec![(i + 1, StateTransition::ConsumeBits(8), vec![])]);
1092                i += 1;
1093            },
1094            Token::ByteBoundary => {
1095                result.push(vec![(i + 1, StateTransition::Free, vec![StateFlag::NotOffsetAnd(0x07)])]);
1096                i += 1;
1097            },
1098            Token::NibbleBoundary => {
1099                result.push(vec![(i + 1, StateTransition::Free, vec![StateFlag::NotOffsetAnd(0x03)])]);
1100                i += 1;
1101            }
1102        }
1103    }
1104    result
1105}
1106
1107fn normalize_index(removed: &Vec<usize>, index: usize) -> usize {
1108    let mut result = index;
1109    for r in removed {
1110        if *r < index {
1111            result -= 1;
1112        }
1113    }
1114    result
1115}
1116
1117fn get_direct_transitions(fsm: &Vec<Vec<(usize, StateTransition, Vec<StateFlag>)>>, state: usize) -> Vec<(usize, StateTransition, Vec<StateFlag>)> {
1118    let mut result = Vec::<(usize, StateTransition, Vec<StateFlag>)>::new();
1119    let mut explorer_state = state;
1120    let mut path = Vec::<(usize, usize)>::new(); // state, transition index
1121    let mut sum_flags = Vec::<Vec<StateFlag>>::new();
1122    let mut t_index = 0;
1123    let final_depth = fsm.len();
1124    // println!("Processing state {}", state);
1125    loop {
1126        let mut transitions = &fsm[explorer_state];
1127        // println!("{} {:?}", t_index, path);
1128        while t_index >= transitions.len() { // If we're out of transitions, start backtracking
1129            sum_flags.pop();
1130            match path.pop() {
1131                Some((dest, i)) => {
1132                    // println!("Backtracking");
1133                    explorer_state = dest;
1134                    transitions = &fsm[explorer_state];
1135                    t_index = i;
1136                },
1137                None => return result // If we've explored every path and we're back at the root, return
1138            }
1139        }
1140        match &transitions[t_index] {
1141            (dest, StateTransition::Free, flags) if *dest != final_depth => { // If it's a free transition, go deeper
1142                // println!("\tDeeper to state {}", dest);
1143                if path.contains(&(explorer_state, t_index + 1)) {
1144                    t_index += 1;
1145                    continue // If the path takes a loop, leave that option out of the final list.
1146                }
1147                path.push((explorer_state, t_index + 1));
1148                sum_flags.push(flags.to_vec());
1149                explorer_state = *dest;
1150                t_index = 0;
1151                continue
1152            },
1153            (dest, t, final_flags) => { // If it's not a free transition, record it
1154                // println!("\tRecording state transition {} {}", state, t_index);
1155                let mut new_flags = Vec::<StateFlag>::new();
1156                for flags in &sum_flags {
1157                    new_flags.extend(flags.to_vec());
1158                }
1159                new_flags.extend(final_flags.to_vec());
1160                result.push((*dest, t.clone(), new_flags));
1161                t_index += 1;
1162            }
1163        }
1164    }
1165}
1166
1167fn remove_dead_states(fsm: &mut Vec<Vec<(usize, StateTransition, Vec<StateFlag>)>>) {
1168    let mut valid_states = HashSet::new();
1169    valid_states.insert(0);
1170    for transitions in fsm.iter() {
1171        for (dest, _t, _flags) in transitions {
1172            valid_states.insert(*dest);
1173        }
1174    }
1175    let n_states = fsm.len();
1176    let mut removed = Vec::<usize>::new();
1177    for state in 0..n_states {
1178        if !valid_states.contains(&state) {
1179            removed.push(state);
1180        }
1181    }
1182
1183    removed.reverse();
1184
1185    for r in &removed {
1186        fsm.remove(*r);
1187    }
1188
1189    for transitions in fsm.iter_mut() {
1190        for transition in transitions {
1191            transition.0 = normalize_index(&removed, transition.0);
1192        }
1193    }
1194
1195}
1196
1197fn optimize(fsm: &mut Vec<Vec<(usize, StateTransition, Vec<StateFlag>)>>) {
1198
1199    let n_states = fsm.len();
1200    for state in 0..n_states {
1201        let new_transitions = get_direct_transitions(fsm, state);
1202        fsm[state] = new_transitions;
1203    }
1204
1205    // println!("{:?}", fsm);
1206
1207    remove_dead_states(fsm);
1208
1209}
1210
1211fn tokenize(pattern: &str) -> Result<(Vec<Token>, usize, std::collections::HashMap<String, usize>), String> {
1212    let char_vec: Vec<char> = pattern.chars().collect();
1213    tokenize_vec(&char_vec, 1)
1214}
1215
1216fn parse_group_header(input: &[char]) -> Result<(GroupType, usize), String> {
1217    let mut input_iter = input.iter();
1218    match &input_iter.next() {
1219        Some('?') => {
1220            match &input_iter.next() {
1221                Some('<') => {
1222                    let mut name = Vec::<char>::new();
1223                    let mut offset = 2;
1224                    for c in input_iter {
1225                        match c {
1226                            '>' => {
1227                                return Ok((GroupType::Named(name.into_iter().collect::<String>()), offset + 1));
1228                            },
1229                            _ => name.push(*c)
1230                        }
1231                        offset += 1;
1232                    }
1233                    Err("Unterminated group name".to_string())
1234                },
1235                Some(':') => {
1236                    Ok((GroupType::NonCapturing, 2))
1237                },
1238                _ => Err("'?' at beginning of group".to_string())
1239            }
1240        },
1241        _ => Ok((GroupType::Numbered, 0))
1242    }
1243}
1244
1245fn tokenize_vec(char_vec: &Vec<char>, start_group: usize) -> Result<(Vec<Token>, usize, std::collections::HashMap<String, usize>), String> {
1246    let mut result = Vec::<Token>::new();
1247    let mut i = 0;
1248    let mut option_flag = false;
1249    let mut groups_map = std::collections::HashMap::new();
1250    let mut group_index = start_group;
1251    loop {
1252        if i >= char_vec.len() {
1253            break
1254        }
1255        let result_len = result.len();
1256        match char_vec[i] {
1257            '_' => result.push(Token::BitZero),
1258            '\'' => result.push(Token::BitOne),
1259            '.' => result.push(Token::BitAny),
1260            '!' => result.push(Token::ByteAny),
1261            ':' => result.push(Token::ByteBoundary),
1262            ';' => result.push(Token::NibbleBoundary),
1263            '?' => {
1264                match result.pop() {
1265                    Some(last) => {
1266                        let mut greediness = Greediness::Greedy;
1267                        if i + 1 < char_vec.len() && char_vec[i + 1] == '?' {
1268                            greediness = Greediness::Lazy;
1269                            i += 1;
1270                        }
1271                        result.push(Token::Repetition(Repeat::LessThan(2, greediness), Box::new(last)));
1272                    },
1273                    None => return Err("Encountered '?' at beginning of group".to_string())
1274                }
1275            },
1276            '*' => {
1277                match result.pop() {
1278                    Some(last) => {
1279                        let mut greediness = Greediness::Greedy;
1280                        if i + 1 < char_vec.len() && char_vec[i + 1] == '?' {
1281                            greediness = Greediness::Lazy;
1282                            i += 1;
1283                        }
1284                        result.push(Token::Repetition(Repeat::Any(greediness), Box::new(last)));
1285                    },
1286                    None => return Err("Encountered '*' at beginning of group".to_string())
1287                }
1288            },
1289            '+' => {
1290                if result.is_empty() {
1291                    return Err("Encountered '+' at beginning of group".to_string())
1292                } else {
1293                    let mut greediness = Greediness::Greedy;
1294                    if i + 1 < char_vec.len() && char_vec[i + 1] == '?' {
1295                        greediness = Greediness::Lazy;
1296                        i += 1;
1297                    }
1298                    let last = result[result.len() - 1].clone();
1299                    result.push(Token::Repetition(Repeat::Any(greediness), Box::new(last)))
1300                }
1301            },
1302            '|' => {
1303                option_flag = true;
1304            },
1305            '[' => {
1306                match find_right_delimiter(char_vec, i, '[', ']') {
1307                    Some(end_index) => {
1308                        match parse_char_class(&char_vec[i+1..end_index].to_vec()) {
1309                            Ok(token) => result.push(token),
1310                            Err(msg) => return Err(msg)
1311                        }
1312                        i = end_index;
1313                    },
1314                    None => return Err("Unclosed opening delimiter '['".to_string())
1315                }
1316            },
1317            '{' => {
1318                match find_right_delimiter(char_vec, i, '{', '}') {
1319                    Some(end_index) => {
1320                        let inside = &char_vec[i+1..end_index];
1321                        i = end_index;
1322
1323                        let mut greediness = Greediness::Greedy;
1324                        if i + 1 < char_vec.len() && char_vec[i + 1] == '?' {
1325                            greediness = Greediness::Lazy;
1326                            i += 1;
1327                        }
1328
1329                        match inside.iter().position(|x| *x == ',') {
1330                            Some(sep) => {
1331                                if sep + 1 == inside.len() { // {n,}
1332                                    match inside[..sep].iter().collect::<String>().parse::<usize>() {
1333                                        Ok(n) => {
1334                                            match result.pop() {
1335                                                Some(last) => {
1336                                                    result.push(Token::Repetition(Repeat::Exactly(n), Box::new(last.clone())));
1337                                                    result.push(Token::Repetition(Repeat::Any(greediness), Box::new(last)));
1338                                                },
1339                                                None => return Err("Encountered '{' at beginning of group".to_string())
1340                                            }
1341                                        },
1342                                        Err(msg) => return Err(msg.to_string())
1343                                    }
1344                                } else { // {n,m}
1345                                    match (inside[..sep].iter().collect::<String>().parse::<usize>(),
1346                                            inside[1+sep..].iter().collect::<String>().parse::<usize>())
1347                                    {
1348                                        (Ok(n), Ok(m)) => {
1349                                            match result.pop() {
1350                                                Some(last) => {
1351                                                    result.push(Token::Repetition(Repeat::Exactly(n), Box::new(last.clone())));
1352                                                    result.push(Token::Repetition(Repeat::LessThan(m - n + 1, greediness), Box::new(last)));
1353                                                },
1354                                                None => return Err("Encountered '{' at beginning of group".to_string())
1355                                            }
1356                                        },
1357                                        (Err(msg), _) => return Err(msg.to_string()),
1358                                        (_, Err(msg)) => return Err(msg.to_string()),
1359                                    }
1360                                }
1361                            },
1362                            None => { // {n}
1363                                match inside.iter().collect::<String>().parse::<usize>() {
1364                                    Ok(n) => {
1365                                        match result.pop() {
1366                                            Some(last) => result.push(Token::Repetition(Repeat::Exactly(n), Box::new(last))),
1367                                            None => return Err("Encountered '{' at beginning of group".to_string())
1368                                        }
1369                                    },
1370                                    Err(msg) => return Err(msg.to_string())
1371                                } 
1372                            }
1373                        }
1374                    },
1375                    None => return Err("Unclosed opening delimiter '{'".to_string())
1376                }
1377            },
1378            '(' => {
1379                match find_right_delimiter(char_vec, i, '(', ')') {
1380                    Some(end_index) => {
1381                        match parse_group_header(&char_vec[i+1..end_index]) {
1382                            Ok((GroupType::Numbered, start_index)) => {
1383                                match tokenize_vec(&char_vec[i+1+start_index..end_index].to_vec(), group_index + 1) {
1384                                    Ok((tokens, new_groups, new_groups_map)) => {
1385                                        result.push(Token::Group(Some(group_index), tokens));
1386                                        group_index += new_groups + 1;
1387                                        for (key, val) in new_groups_map.iter() {
1388                                            if groups_map.contains_key(key) {
1389                                                return Err(format!("Group name '{}' used more than once", key))
1390                                            } else {
1391                                                groups_map.insert(key.clone(), *val);
1392                                            }
1393                                        }
1394                                    },
1395                                    Err(msg) => return Err(msg)
1396                                } 
1397                                i = end_index;
1398                            },
1399                            Ok((GroupType::Named(name), start_index)) => {
1400                                if groups_map.contains_key(&name) {
1401                                    return Err(format!("Group name '{}' used more than once", name))
1402                                } else {
1403                                    groups_map.insert(name, group_index);
1404                                }
1405                                match tokenize_vec(&char_vec[i+1+start_index..end_index].to_vec(), group_index + 1) {
1406                                    Ok((tokens, new_groups, new_groups_map)) => {
1407                                        result.push(Token::Group(Some(group_index), tokens));
1408                                        group_index += new_groups + 1;
1409                                        for (key, val) in new_groups_map.iter() {
1410                                            if groups_map.contains_key(key) {
1411                                                return Err(format!("Group name '{}' used more than once", key))
1412                                            } else {
1413                                                groups_map.insert(key.clone(), *val);
1414                                            }
1415                                        }
1416                                    },
1417                                    Err(msg) => return Err(msg)
1418                                } 
1419                                i = end_index;
1420                            },
1421                            Ok((GroupType::NonCapturing, start_index)) => {
1422                                match tokenize_vec(&char_vec[i+1+start_index..end_index].to_vec(), group_index) {
1423                                    Ok((tokens, new_groups, new_groups_map)) => {
1424                                        result.push(Token::Group(None, tokens));
1425                                        group_index += new_groups;
1426                                        for (key, val) in new_groups_map.iter() {
1427                                            if groups_map.contains_key(key) {
1428                                                return Err(format!("Group name '{}' used more than once", key))
1429                                            } else {
1430                                                groups_map.insert(key.clone(), *val);
1431                                            }
1432                                        }
1433                                    },
1434                                    Err(msg) => return Err(msg)
1435                                } 
1436                                i = end_index;
1437                            },
1438                            _ => todo!()
1439                        }
1440
1441                    },
1442                    None => return Err("Unclosed opening delimiter '('".to_string())
1443                }
1444            },
1445            c => {
1446                match c {
1447                    '0'..='9' => {
1448                        result.push(Token::Nibble(c as u8 - 48));
1449                    },
1450                    'a'..='f' => {
1451                        result.push(Token::Nibble(c as u8 - 87));
1452                    },
1453                    'A'..='F' => {
1454                        result.push(Token::Nibble(c as u8 - 55));
1455                    },
1456                    _ => return Err(format!("Encountered unexpected character '{}'", c))
1457                }
1458            }
1459        }
1460        if option_flag && result.len() != result_len { // If a token was added after the option delimiter was spotted
1461            match result.pop() {
1462                Some(last) => {
1463                    match result.pop() {
1464                        Some(Token::Choice(options)) => {
1465                            let mut options = options.to_vec();
1466                            options.push(last);
1467                            result.push(Token::Choice(options));
1468                            option_flag = false;
1469                        },
1470                        Some(token) => {
1471                            result.push(Token::Choice(vec![token, last]));
1472                            option_flag = false;
1473                        },
1474                        None => return Err("Encountered unexpected character '|'".to_string())
1475                    }
1476                },
1477                None => return Err("Encountered '|' at beginning of group".to_string())
1478            }
1479        }
1480        i += 1;
1481    }
1482    Ok((result, group_index - start_group, groups_map))
1483}
1484
1485fn check_state_flags<T: BitIndexable>(input: &T, flags: &Vec<StateFlag>, offset: BitIndex) -> bool {
1486    for flag in flags {
1487        match flag {
1488            StateFlag::GroupStart(_) | StateFlag::GroupEnd(_) => (),
1489            StateFlag::NotOffsetAnd(value) => {
1490                if (offset.bit() as usize & value) != 0 {
1491                    return false
1492                }
1493            }
1494        }
1495    }
1496    true
1497}
1498
1499
1500/// Structure to contain a single [`BinRegex`] match.
1501///
1502/// # Examples
1503/// ```rust
1504/// use bitutils2::{BinRegex, BitIndex};
1505///
1506/// let input = vec![0x00, 0x0A, 0xBC, 0xD0, 0xAB];
1507///
1508/// let mut re = BinRegex::new(".*A(BC)(F?)").unwrap();
1509///
1510/// let cap = re.captures(&input).unwrap();
1511///
1512/// let m0 = cap.get(0).unwrap();
1513/// assert_eq!(m0.start(), BitIndex::new(0, 0));
1514/// assert_eq!(m0.end(), BitIndex::new(3, 0));
1515/// assert_eq!(m0.span(), (BitIndex::new(0, 0), BitIndex::new(3, 0)));
1516/// assert_eq!(m0.is_empty(), false);
1517///
1518/// let m1 = cap.get(1).unwrap();
1519/// assert_eq!(m1.start(), BitIndex::new(2, 0));
1520/// assert_eq!(m1.end(), BitIndex::new(3, 0));
1521/// assert_eq!(m1.span(), (BitIndex::new(2, 0), BitIndex::new(3, 0)));
1522/// assert_eq!(m1.is_empty(), false);
1523///
1524/// let m2 = cap.get(2).unwrap();
1525/// assert_eq!(m2.start(), BitIndex::new(3, 0));
1526/// assert_eq!(m2.end(), BitIndex::new(3, 0));
1527/// assert_eq!(m2.span(), (BitIndex::new(3, 0), BitIndex::new(3, 0)));
1528/// assert_eq!(m2.is_empty(), true);
1529///```
1530#[derive(PartialEq, Eq, Debug, Clone, Copy)]
1531pub struct BinMatch<'a, T: BitIndexable> {
1532    input: &'a T,
1533    start: BitIndex,
1534    end: BitIndex
1535}
1536
1537impl<'a, T: BitIndexable> BinMatch<'a, T> {
1538
1539    /// Basic constructor for structure
1540    pub fn new(input: &'a T, start: BitIndex, end: BitIndex) -> BinMatch<'a, T> {
1541        BinMatch {input, start, end}
1542    }
1543
1544    /// Accesses the start index of the match
1545    pub fn start(&self) -> BitIndex {
1546        self.start
1547    }
1548
1549    /// Accesses the end index of the match
1550    pub fn end(&self) -> BitIndex {
1551        self.end
1552    }
1553
1554    /// Returns `true` if the match is empty (`start == end`)
1555    pub fn is_empty(&self) -> bool {
1556        self.start == self.end
1557    }
1558
1559    /// Returns a tuple with the start and end indices of the match
1560    pub fn span(&self) -> (BitIndex, BitIndex) {
1561        (self.start, self.end)
1562    }
1563
1564    /// Returns the contents of the match as a [`BitField`]
1565    pub fn as_bf(&self) -> BitField {
1566        self.input.bit_slice(&self.start, &self.end)
1567    }
1568}
1569
1570/// Iterator that yields successive non-overlapping matches of the provided input.
1571/// There is no public constructor for this structure, instead users must use
1572/// [`BinRegex.find_iter`](crate::BinRegex::find_iter).
1573pub struct BinMatches<'a, T: BitIndexable> {
1574    gen: CapturePathGenerator<'a, T>,
1575    max_index: BitIndex,
1576    current_offset: BitIndex
1577}
1578
1579impl<'a, T: BitIndexable> BinMatches<'a, T> {
1580    fn new(fsm: &'a Vec<Vec<(usize, StateTransition, Vec<StateFlag>)>>, input: &'a T) -> BinMatches<'a, T> {
1581        let gen = CapturePathGenerator::new(fsm, input);
1582        let max_index = gen.input.max_index();
1583        BinMatches {gen, max_index, current_offset: BitIndex::new(0, 0)}
1584    }
1585}
1586
1587impl<'a, T: BitIndexable> std::iter::Iterator for BinMatches<'a, T> {
1588    type Item = BinMatch<'a, T>;
1589
1590    fn next(&mut self) -> Option<BinMatch<'a, T>> {
1591        while self.current_offset < self.max_index {
1592            self.gen.reset(self.current_offset);
1593            if let Some((_, initial_offset)) = self.gen.next() {
1594                let mut max_offset = initial_offset;
1595                loop {
1596                    match self.gen.next() {
1597                        Some((_, offset)) if offset > max_offset => {
1598                            max_offset = offset;
1599                        },
1600                        None => {
1601                            let res = Some(BinMatch::new(self.gen.input, self.current_offset, max_offset));
1602                            self.current_offset = max_offset;
1603                            return res
1604                        },
1605                        _ => ()
1606                    }
1607                }
1608            }
1609            self.current_offset += 1;
1610        }
1611        None
1612    }
1613}
1614
1615/// Represents the capture groups from a single match
1616pub struct BinCaptures<'a, T: BitIndexable> {
1617    groups: Vec<Option<BinMatch<'a, T>>>,
1618    groups_map: &'a std::collections::HashMap<String, usize>
1619}
1620
1621impl<'a, T: BitIndexable> BinCaptures<'a, T> {
1622
1623    /// Returns the capture group associated with the index `i` if that group
1624    /// matched any part of the input. Returns `None` otherwise. Capture groups
1625    /// are numbered from left to right by the position of the left parenthesis,
1626    /// starting at 1. Group 0 refers to the entire match.
1627    pub fn get(&self, i: usize) -> Option<BinMatch<T>> {
1628        if i < self.groups.len() {
1629            match &self.groups[i] {
1630                Some(m) => Some(BinMatch::new(m.input, m.start, m.end)),
1631                None => None
1632            }
1633        } else {
1634            None
1635        }
1636    }
1637
1638    /// Returns the capture group associated with the name `name` if a group
1639    /// with that name exists in the expression and matched any part of the
1640    /// input. Returns `None` otherwise.
1641    pub fn name(&self, name: &str) -> Option<BinMatch<T>> {
1642        match self.groups_map.get(&name.to_string()) {
1643            Some(index) => self.get(*index),
1644            None => None
1645        }
1646    }
1647}
1648
1649struct CapturePathGenerator<'a, T: BitIndexable> {
1650    fsm: &'a Vec<Vec<(usize, StateTransition, Vec<StateFlag>)>>,
1651    input: &'a T,
1652    current_path: Vec::<(usize, usize, BitIndex)>,
1653    max_offset: BitIndex,
1654    current_offset: BitIndex,
1655    current_state: usize,
1656    t_index: usize
1657}
1658
1659impl<'a, T: BitIndexable> CapturePathGenerator<'a, T> {
1660
1661    fn new(fsm: &'a Vec<Vec<(usize, StateTransition, Vec<StateFlag>)>>, input: &'a T) -> CapturePathGenerator<'a, T> {
1662        CapturePathGenerator {
1663            fsm,
1664            input,
1665            current_path: Vec::<(usize, usize, BitIndex)>::new(),
1666            max_offset: input.max_index(),
1667            current_offset: BitIndex::new(0, 0),
1668            current_state: 0,
1669            t_index: 0
1670        }
1671    }
1672
1673    fn reset(&mut self, offset: BitIndex) {
1674        self.current_path = Vec::<(usize, usize, BitIndex)>::new();
1675        self.current_offset = offset;
1676        self.current_state = 0;
1677        self.t_index = 0;
1678    }
1679
1680    fn try_backtrack(&mut self) -> bool {
1681        // Backtracks if possible and returns true. If not possible, returns false.
1682        match self.current_path.pop() {
1683            Some((dest, i, new_offset)) => {
1684                self.current_state = dest;
1685                self.t_index = i + 1;
1686                self.current_offset = new_offset;
1687                true
1688            },
1689            None => false
1690        }
1691    }
1692
1693    fn next(&mut self) -> Option<(Vec::<(usize, usize, BitIndex)>, BitIndex)> {
1694        self.try_backtrack();
1695        loop {
1696            let mut transitions = &self.fsm[self.current_state];
1697            loop {
1698
1699                while self.t_index >= transitions.len() {
1700                    if !self.try_backtrack() {
1701                        return None
1702                    }
1703                    transitions = &self.fsm[self.current_state];
1704                }
1705
1706                let (dest, transition, flags) = &transitions[self.t_index];
1707                if check_state_flags(self.input, flags, self.current_offset) {
1708                    match transition {
1709                        StateTransition::Free => {
1710                            self.current_path.push((self.current_state, self.t_index, self.current_offset));
1711                            self.current_state = *dest;
1712                            break;
1713                        },
1714                        StateTransition::EqualsBit(value) => {
1715                            if self.current_offset < self.max_offset && self.input.bit_at(&self.current_offset) == *value {
1716                                self.current_path.push((self.current_state, self.t_index, self.current_offset));
1717                                self.current_state = *dest;
1718                                self.current_offset += 1;
1719                                break;
1720                            }
1721                        },
1722                        StateTransition::ConsumeBits(n) => {
1723                            if self.current_offset + n <= self.max_offset {
1724                                self.current_path.push((self.current_state, self.t_index, self.current_offset));
1725                                self.current_state = *dest;
1726                                self.current_offset += n;
1727                                break;
1728                            }
1729                        },
1730                        StateTransition::EqualsNibble(n) => {
1731                            if self.current_offset + 3 < self.max_offset {
1732                                let n2 = (self.input.bit_at(&self.current_offset) << 3) | (self.input.bit_at(&(self.current_offset + 1)) << 2) | 
1733                                        (self.input.bit_at(&(self.current_offset + 2)) << 1) | self.input.bit_at(&(self.current_offset + 3));
1734                                if *n == n2 {
1735                                    self.current_path.push((self.current_state, self.t_index, self.current_offset));
1736                                    self.current_state = *dest;
1737                                    self.current_offset += 4;
1738                                    break;   
1739                                }
1740                            }                       
1741                        },
1742                        StateTransition::CharacterClass(cls) => {
1743                            let n = cls.input_length();
1744                            let end = self.current_offset + n;
1745                            if end <= self.max_offset {
1746                                let bf = self.input.bit_slice(&self.current_offset, &end);
1747                                // println!("{:?}, {}", bf, cls.matches(&bf));
1748                                if cls.matches(&bf) {
1749                                    self.current_path.push((self.current_state, self.t_index, self.current_offset));
1750                                    self.current_state = *dest;
1751                                    self.current_offset = end;
1752                                    break;
1753                                }
1754                            }
1755                        }
1756                    }
1757                }
1758                
1759                self.t_index += 1;
1760            }
1761            self.t_index = 0;
1762            if self.current_state >= self.fsm.len() {
1763
1764                return Some((self.current_path.clone(), self.current_offset))
1765            }
1766            
1767        }
1768    }
1769}
1770
1771/// A compiled variation of regular expressions intended for searching binary data. A
1772/// [`BinRegex`] can be used to search binary data for patterns
1773pub struct BinRegex {
1774    fsm: Vec<Vec<(usize, StateTransition, Vec<StateFlag>)>>,
1775    pub n_groups: usize,
1776    groups_map: std::collections::HashMap<String, usize>
1777}
1778
1779impl BinRegex {
1780
1781    /// Compiles a binary regular expression. Once compiled, this object can be used 
1782    /// repeatedly to search input buffers.
1783    pub fn new(pattern: &str) -> Result<BinRegex, String>{
1784        let (tokens, n_groups, groups_map) = tokenize(pattern)?;
1785        let n_groups = n_groups + 1;
1786        let mut fsm = compile(&tokens, 0);
1787        optimize(&mut fsm);
1788        // println!("{:?}", groups_map);
1789        Ok(BinRegex {fsm, n_groups, groups_map})
1790    }
1791
1792
1793    fn match_path<'a, T>(&self, input: &'a T) -> Option<(Vec::<(usize, usize, BitIndex)>, BitIndex)> 
1794    where &'a T: BitIndexable {
1795        let mut gen = CapturePathGenerator::new(&self.fsm, &input);
1796        match gen.next() {
1797            Some((path, offset)) => {
1798                let mut max_offset = offset;
1799                let mut current_path = path;
1800                loop {
1801                    match gen.next() {
1802                        Some((path, offset)) if offset > max_offset => {
1803                            max_offset = offset;
1804                            current_path = path
1805                        },
1806                        None => return Some((current_path, max_offset)),
1807                        _ => ()
1808                    }
1809                }
1810            },
1811            None => None
1812        }
1813    }
1814
1815    /// Searches for the first match in the input given, and if found returns a [`BinMatch`]
1816    /// object corresponding to the match. If no match is found, returns `None`. 
1817    ///
1818    /// # Examples
1819    ///```rust
1820    /// use bitutils2::{BinRegex, BitIndex};
1821    ///
1822    /// let input = vec![0x00, 0x0a, 0xbc, 0x00, 0x00, 0x00, 0xff, 0x00];
1823    ///
1824    /// let re1 = BinRegex::new("ABC_{8,}''").unwrap();
1825    /// let m1 = re1.find(&input).unwrap();
1826    /// assert_eq!(m1.span(), (BitIndex::new(1, 4), BitIndex::new(6, 2)));
1827    ///
1828    /// let re2 = BinRegex::new("ABC_{8,}'_").unwrap();
1829    /// assert_eq!(re2.find(&input), None);
1830    ///```
1831    pub fn find<'a, T>(&self, input: &'a T) -> Option<BinMatch<'a, T>> 
1832    where &'a T: BitIndexable, T: BitIndexable {
1833        let mut gen = CapturePathGenerator::new(&self.fsm, input);
1834
1835        let mut i = BitIndex::new(0, 0);
1836        let max_index = input.max_index();
1837        while i < max_index {
1838            gen.reset(i);
1839            if let Some((_, initial_offset)) = gen.next() {
1840                let mut max_offset = initial_offset;
1841                loop {
1842                    match gen.next() {
1843                        Some((_, offset)) if offset > max_offset => {
1844                            max_offset = offset;
1845                        },
1846                        None => return Some(BinMatch::new(input, i, max_offset)),
1847                        _ => ()
1848                    }
1849                }
1850            }
1851            i += 1;
1852        }
1853        None
1854    }
1855
1856    /// Returns an iterator that yields successive non-overlapping matches in the input buffer.
1857    pub fn find_iter<'a, T>(&'a self, input: &'a T) -> BinMatches<'a, T>
1858    where &'a T: BitIndexable, T: BitIndexable {
1859        BinMatches::new(&self.fsm, input)
1860
1861    }
1862
1863    pub fn match_length<'a, T>(&'a self, input: &'a T) -> Option<BitIndex> 
1864    where &'a T: BitIndexable {
1865        match self.match_path(input) {
1866            Some((_, offset)) => Some(offset),
1867            None => None
1868        }
1869    }
1870
1871    /// This routine searches for the first match of this regex in the input given, and if found, 
1872    /// returns not only the overall match but also the matches of each capture group in the 
1873    /// expression. If no match is found, then `None` is returned.
1874    pub fn captures<'a, T>(&'a self, input: &'a T) -> Option<BinCaptures<T>> 
1875    where &'a T: BitIndexable, T: BitIndexable {
1876        match self.match_path(input) {
1877            Some((mut path, offset)) => {
1878                // println!("{:?}",path);
1879                let mut groups = Vec::<Option<BinMatch<T>>>::new();
1880                for _ in 0..self.n_groups {
1881                    groups.push(None);
1882                }
1883
1884                let mut group_ends = std::collections::HashMap::new();
1885                path.reverse();
1886                for (state, t_index, offset) in path {
1887                    let mut flags = self.fsm[state][t_index].2.to_vec();
1888                    // println!("{:?}", flags);
1889                    flags.reverse();
1890                    for flag in flags {
1891                        match flag {
1892                            StateFlag::GroupEnd(group_index) => {
1893                                group_ends.insert(group_index, offset);
1894                            },
1895                            StateFlag::GroupStart(group_index) => {
1896                                match &groups[group_index] {
1897                                    None => 
1898                                        match group_ends.get(&group_index) {
1899                                            Some(end_index) => groups[group_index] = Some(BinMatch::new(input, offset, *end_index)),
1900                                            None => panic!("Group start with no end")
1901                                        },
1902                                    Some(bin_match) if bin_match.is_empty() => { // It's okay to overwrite the last match if it's empty
1903                                        match group_ends.get(&group_index) {
1904                                            Some(end_index) if *end_index != offset => groups[group_index] = Some(BinMatch::new(input, offset, *end_index)),
1905                                            _ => ()
1906                                        }
1907                                    },
1908                                    _ => ()
1909                                }
1910                            },
1911                            _ => ()
1912                        }
1913                    }
1914                }
1915                groups[0] = Some(BinMatch::new(input, BitIndex::new(0, 0), offset));
1916                Some(BinCaptures {groups, groups_map: &self.groups_map})
1917            }
1918            None => None
1919        }
1920    }
1921}
1922
1923#[cfg(test)]
1924mod tests {
1925    use super::*;
1926
1927    #[test]
1928    fn uint_be_char_class() {
1929        let input = "u5:0,1,23".chars().collect();
1930        let res = parse_char_class(&input).unwrap();
1931        let cls = DynamicCharacterClass::UInt(CharClass::<UInt>{
1932            inverted: false, 
1933            endian: Endian::Big, 
1934            options: HashSet::from([UInt(0), UInt(1), UInt(23)]),
1935            ranges: vec![], 
1936            input_length: bx!(,5)
1937        });
1938        assert_eq!(res, Token::CharacterClass(std::rc::Rc::new(cls)));
1939
1940        let input = ">u6:^0,1,23".chars().collect();
1941        let res = parse_char_class(&input).unwrap();
1942        let cls = DynamicCharacterClass::UInt(CharClass::<UInt>{
1943            inverted: true, 
1944            endian: Endian::Big, 
1945            options: HashSet::from([UInt(0), UInt(1), UInt(23)]), 
1946            ranges: vec![], 
1947            input_length: bx!(,6)
1948        });
1949        assert_eq!(res, Token::CharacterClass(std::rc::Rc::new(cls)));
1950
1951        let input = "u8:^0,50..64,1,23,100..128".chars().collect();
1952        let res = parse_char_class(&input).unwrap();
1953        let cls = DynamicCharacterClass::UInt(CharClass::<UInt>{
1954            inverted: true, 
1955            endian: Endian::Big, 
1956            options: HashSet::from([UInt(0), UInt(1), UInt(23)]),  
1957            ranges: vec![(UInt(50),UInt(64)), (UInt(100), UInt(128))], 
1958            input_length: bx!(,8)
1959        });
1960        assert_eq!(res, Token::CharacterClass(std::rc::Rc::new(cls)));
1961
1962        let input = ">u120:^0,51230..641234,1,23000,100..128".chars().collect();
1963        let res = parse_char_class(&input).unwrap();
1964        let cls = DynamicCharacterClass::UInt(CharClass::<UInt>{
1965            inverted: true, 
1966            endian: Endian::Big,
1967            options: HashSet::from([UInt(0), UInt(1), UInt(23000)]),  
1968            ranges: vec![(UInt(51230),UInt(641234)), (UInt(100), UInt(128))], 
1969            input_length: bx!(,120)
1970        });
1971        assert_eq!(res, Token::CharacterClass(std::rc::Rc::new(cls)));
1972    }
1973
1974    #[test]
1975    fn uint_le_char_class() {
1976        let input = "<u5:0,1,23".chars().collect();
1977        let res = parse_char_class(&input).unwrap();
1978        let cls = DynamicCharacterClass::UInt(CharClass::<UInt>{
1979            inverted: false, 
1980            endian: Endian::Little, 
1981            options: HashSet::from([UInt(0), UInt(1), UInt(23)]),
1982            ranges: vec![], 
1983            input_length: bx!(,5)
1984        });
1985        assert_eq!(res, Token::CharacterClass(std::rc::Rc::new(cls)));
1986
1987        let input = "<u6:^0,1,23".chars().collect();
1988        let res = parse_char_class(&input).unwrap();
1989        let cls = DynamicCharacterClass::UInt(CharClass::<UInt>{
1990            inverted: true, 
1991            endian: Endian::Little, 
1992            options: HashSet::from([UInt(0), UInt(1), UInt(23)]), 
1993            ranges: vec![], 
1994            input_length: bx!(,6)
1995        });
1996        assert_eq!(res, Token::CharacterClass(std::rc::Rc::new(cls)));
1997
1998        let input = "<u8:^0,50..64,1,23,100..128".chars().collect();
1999        let res = parse_char_class(&input).unwrap();
2000        let cls = DynamicCharacterClass::UInt(CharClass::<UInt>{
2001            inverted: true, 
2002            endian: Endian::Little, 
2003            options: HashSet::from([UInt(0), UInt(1), UInt(23)]),  
2004            ranges: vec![(UInt(50),UInt(64)), (UInt(100), UInt(128))], 
2005            input_length: bx!(,8)
2006        });
2007        assert_eq!(res, Token::CharacterClass(std::rc::Rc::new(cls)));
2008
2009        let input = "<u120:^0,51230..641234,1,23000,100..128".chars().collect();
2010        let res = parse_char_class(&input).unwrap();
2011        let cls = DynamicCharacterClass::UInt(CharClass::<UInt>{
2012            inverted: true, 
2013            endian: Endian::Little,
2014            options: HashSet::from([UInt(0), UInt(1), UInt(23000)]),  
2015            ranges: vec![(UInt(51230),UInt(641234)), (UInt(100), UInt(128))], 
2016            input_length: bx!(,120)
2017        });
2018        assert_eq!(res, Token::CharacterClass(std::rc::Rc::new(cls)));
2019    }
2020
2021    #[test]
2022    fn twos_compliment_be_char_class() {
2023        let input = "i5:0,-1,+23".chars().collect();
2024        let res = parse_char_class(&input).unwrap();
2025        let cls = DynamicCharacterClass::TwosCompInt(CharClass::<TwosCompInt>{
2026            inverted: false, 
2027            endian: Endian::Big, 
2028            options: HashSet::from([TwosCompInt(0), TwosCompInt(-1), TwosCompInt(23)]),
2029            ranges: vec![], 
2030            input_length: bx!(,5)
2031        });
2032        assert_eq!(res, Token::CharacterClass(std::rc::Rc::new(cls)));
2033
2034        let input = ">i6:^+0,1,-23".chars().collect();
2035        let res = parse_char_class(&input).unwrap();
2036        let cls = DynamicCharacterClass::TwosCompInt(CharClass::<TwosCompInt>{
2037            inverted: true, 
2038            endian: Endian::Big, 
2039            options: HashSet::from([TwosCompInt(0), TwosCompInt(1), TwosCompInt(-23)]), 
2040            ranges: vec![], 
2041            input_length: bx!(,6)
2042        });
2043        assert_eq!(res, Token::CharacterClass(std::rc::Rc::new(cls)));
2044
2045        let input = "i8:^-0,-50..+64,-1,+23,-130..-100".chars().collect();
2046        let res = parse_char_class(&input).unwrap();
2047        let cls = DynamicCharacterClass::TwosCompInt(CharClass::<TwosCompInt>{
2048            inverted: true, 
2049            endian: Endian::Big, 
2050            options: HashSet::from([TwosCompInt(0), TwosCompInt(-1), TwosCompInt(23)]),  
2051            ranges: vec![(TwosCompInt(-50), TwosCompInt(64)), (TwosCompInt(-130), TwosCompInt(-100))], 
2052            input_length: bx!(,8)
2053        });
2054        assert_eq!(res, Token::CharacterClass(std::rc::Rc::new(cls)));
2055
2056        let input = ">i120:^+0,-521230..-641234,1,-23000,+100..128".chars().collect();
2057        let res = parse_char_class(&input).unwrap();
2058        let cls = DynamicCharacterClass::TwosCompInt(CharClass::<TwosCompInt>{
2059            inverted: true, 
2060            endian: Endian::Big,
2061            options: HashSet::from([TwosCompInt(0), TwosCompInt(1), TwosCompInt(-23000)]),  
2062            ranges: vec![(TwosCompInt(-521230), TwosCompInt(-641234)), (TwosCompInt(100), TwosCompInt(128))], 
2063            input_length: bx!(,120)
2064        });
2065        assert_eq!(res, Token::CharacterClass(std::rc::Rc::new(cls)));
2066    }
2067
2068    #[test]
2069    fn ascii_char_class() {
2070        let input = "abc".chars().collect();
2071        let res = parse_char_class(&input);
2072        let mut options = HashSet::new();
2073        options.insert(UInt(97));
2074        options.insert(UInt(98));
2075        options.insert(UInt(99));
2076        let cls = DynamicCharacterClass::UInt(CharClass::<UInt>{
2077            inverted: false, 
2078            endian: Endian::Big, 
2079            options, 
2080            ranges: vec![], 
2081            input_length: bx!(,8)
2082        });
2083        assert_eq!(res, Ok(Token::CharacterClass(std::rc::Rc::new(cls))));
2084
2085        let input = "a8:^abc".chars().collect();
2086        let res = parse_char_class(&input);
2087        let mut options = HashSet::new();
2088        options.insert(UInt(97));
2089        options.insert(UInt(98));
2090        options.insert(UInt(99));
2091        let cls = DynamicCharacterClass::UInt(CharClass::<UInt>{
2092            inverted: true, 
2093            endian: Endian::Big, 
2094            options, 
2095            ranges: vec![], 
2096            input_length: bx!(,8)
2097        });
2098        assert_eq!(res, Ok(Token::CharacterClass(std::rc::Rc::new(cls))));
2099
2100        let input = "\\\\a\\bc".chars().collect();
2101        let res = parse_char_class(&input);
2102        let mut options = HashSet::new();
2103        options.insert(UInt(92));
2104        options.insert(UInt(97));
2105        options.insert(UInt(98));
2106        options.insert(UInt(99));
2107        let cls = DynamicCharacterClass::UInt(CharClass::<UInt>{
2108            inverted: false, 
2109            endian: Endian::Big, 
2110            options, 
2111            ranges: vec![], 
2112            input_length: bx!(,8)
2113        });
2114        assert_eq!(res, Ok(Token::CharacterClass(std::rc::Rc::new(cls))));
2115
2116        let input = "a8:abc0-9".chars().collect();
2117        let res = parse_char_class(&input);
2118        let mut options = HashSet::new();
2119        options.insert(UInt(97));
2120        options.insert(UInt(98));
2121        options.insert(UInt(99));
2122        let cls = DynamicCharacterClass::UInt(CharClass::<UInt>{
2123            inverted: false, 
2124            endian: Endian::Big, 
2125            options, 
2126            ranges: vec![(UInt(48), UInt(57))], 
2127            input_length: bx!(,8)
2128        });
2129        assert_eq!(res, Ok(Token::CharacterClass(std::rc::Rc::new(cls))));
2130
2131        let input = "abc0-9\\--z".chars().collect();
2132        let res = parse_char_class(&input);
2133        let mut options = HashSet::new();
2134        options.insert(UInt(97));
2135        options.insert(UInt(98));
2136        options.insert(UInt(99));
2137        let cls = DynamicCharacterClass::UInt(CharClass::<UInt>{
2138            inverted: false, 
2139            endian: Endian::Big, 
2140            options, 
2141            ranges: vec![(UInt(48), UInt(57)), (UInt(45), UInt(122))], 
2142            input_length: bx!(,8)
2143        });
2144        assert_eq!(res, Ok(Token::CharacterClass(std::rc::Rc::new(cls))));
2145
2146        let input = "a8:^0-9abc0-9d\\--zef".chars().collect();
2147        let res = parse_char_class(&input);
2148        let mut options = HashSet::new();
2149        options.insert(UInt(97));
2150        options.insert(UInt(98));
2151        options.insert(UInt(99));
2152        options.insert(UInt(100));
2153        options.insert(UInt(101));
2154        options.insert(UInt(102));
2155        let cls = DynamicCharacterClass::UInt(CharClass::<UInt>{
2156            inverted: true, 
2157            endian: Endian::Big, 
2158            options, 
2159            ranges: vec![(UInt(48), UInt(57)), (UInt(48), UInt(57)), (UInt(45), UInt(122))], 
2160            input_length: bx!(,8)
2161        });
2162        assert_eq!(res, Ok(Token::CharacterClass(std::rc::Rc::new(cls))));
2163    }
2164
2165    #[test]
2166    fn group_headers() {
2167        let input = "___''..".chars().collect::<Vec<char>>();
2168        assert_eq!(parse_group_header(&input), Ok((GroupType::Numbered, 0)));
2169        let input = "?:___''..".chars().collect::<Vec<char>>();
2170        assert_eq!(parse_group_header(&input), Ok((GroupType::NonCapturing, 2)));
2171        let input = "?<hello>___''..".chars().collect::<Vec<char>>();
2172        assert_eq!(parse_group_header(&input), Ok((GroupType::Named("hello".to_string()), 8)));
2173    }
2174
2175    #[test]
2176    fn groups_basic() {
2177        assert_eq!(BinRegex::new("_'.").unwrap().n_groups, 1);
2178        assert_eq!(BinRegex::new("(_'){5}_{2}").unwrap().n_groups, 2);
2179        assert_eq!(BinRegex::new("(_'){2,}_{2,}").unwrap().n_groups, 2);
2180        assert_eq!(BinRegex::new("(_'){2,4}_{1,10}").unwrap().n_groups, 2);
2181        assert_eq!(BinRegex::new("(_'){4,8}(_){1,2}").unwrap().n_groups, 3);
2182        assert_eq!(BinRegex::new("(_')+_+").unwrap().n_groups, 2);
2183        assert_eq!(BinRegex::new("((('')|(_'))*|_)*f0").unwrap().n_groups, 5);
2184    }
2185}