1use std::collections::HashSet;
69
70mod parse_helpers;
71use crate::bin_regex::parse_helpers::{escape_vec, find_right_delimiter};
72
73use crate::bx;
74use crate::bit_index::{BitIndex, BitIndexable};
75use crate::bit_field::{IntFormat, BitField, FromBitField};
76
77#[derive(Debug, Clone, PartialEq)]
78enum Greediness {
79 Greedy,
80 Lazy
81}
82
83#[derive(Debug, Clone, PartialEq)]
84enum Repeat {
85 Exactly(usize),
86 LessThan(usize, Greediness),
87 Any(Greediness)
88}
89
90#[derive(Debug, Clone, PartialEq)]
91enum GroupType {
92 NonCapturing,
93 Numbered,
94 Named(String)
95}
96
97
98#[derive(Debug, PartialEq)]
99enum DynamicCharacterClass {
100 UInt(CharClass<UInt>),
101 TwosCompInt(CharClass<TwosCompInt>),
102 OnesCompInt(CharClass<OnesCompInt>),
103 SignMagInt(CharClass<SignMagInt>),
104 NegaInt(CharClass<NegaInt>),
105 Float16(CharClass<SembFloat<1, 5, 10, 15>>),
106 Float32(CharClass<SembFloat<1, 8, 23, 127>>),
107 Float64(CharClass<SembFloat<1, 11, 52, 1023>>),
108 Float128(CharClass<SembFloat<1, 15, 112, 16383>>)
109}
110
111impl DynamicCharacterClass {
112 fn input_length(&self) -> BitIndex {
113 match self {
114 DynamicCharacterClass::UInt(cls) => cls.input_length(),
115 DynamicCharacterClass::TwosCompInt(cls) => cls.input_length(),
116 DynamicCharacterClass::OnesCompInt(cls) => cls.input_length(),
117 DynamicCharacterClass::SignMagInt(cls) => cls.input_length(),
118 DynamicCharacterClass::NegaInt(cls) => cls.input_length(),
119 DynamicCharacterClass::Float16(cls) => cls.input_length(),
120 DynamicCharacterClass::Float32(cls) => cls.input_length(),
121 DynamicCharacterClass::Float64(cls) => cls.input_length(),
122 DynamicCharacterClass::Float128(cls) => cls.input_length(),
123 }
124 }
125
126 fn matches(&self, input: &BitField) -> bool {
127 match self {
128 DynamicCharacterClass::UInt(cls) => cls.matches(input),
129 DynamicCharacterClass::TwosCompInt(cls) => cls.matches(input),
130 DynamicCharacterClass::OnesCompInt(cls) => cls.matches(input),
131 DynamicCharacterClass::SignMagInt(cls) => cls.matches(input),
132 DynamicCharacterClass::NegaInt(cls) => cls.matches(input),
133 DynamicCharacterClass::Float16(cls) => cls.matches(input),
134 DynamicCharacterClass::Float32(cls) => cls.matches(input),
135 DynamicCharacterClass::Float64(cls) => cls.matches(input),
136 DynamicCharacterClass::Float128(cls) => cls.matches(input)
137 }
138 }
139}
140
141#[derive(Debug, Clone, PartialEq)]
142enum Token {
143 Repetition(Repeat, Box<Token>),
144 Group(Option<usize>, Vec<Token>),
145 Choice(Vec<Token>),
146 CharacterClass(std::rc::Rc<DynamicCharacterClass>),
147 Nibble(u8),
148 ByteBoundary,
149 NibbleBoundary,
150 BitZero,
151 BitOne,
152 BitAny,
153 ByteAny,
154}
155
156#[derive(Debug, Clone)]
157enum StateFlag {
158 GroupStart(usize),
159 GroupEnd(usize),
160 NotOffsetAnd(usize) }
162
163#[derive(Debug, Clone)]
164enum StateTransition {
165 Free,
166 EqualsBit(u8),
167 EqualsNibble(u8),
168 ConsumeBits(usize),
169 CharacterClass(std::rc::Rc<DynamicCharacterClass>)
170}
171
172#[derive(PartialEq, Eq, Ord, std::hash::Hash, Clone, Debug)]
227struct UInt(u128);
228
229impl FromBitField for UInt {
230 fn from_bf_be(bf: &BitField) -> UInt {
231 let mut bf = bf.clone();
232 let n = bf.len().total_bits();
233 if n > 128 {
234 bf.truncate_be(BitIndex::bits(128));
235 } else if n < 128 {
236 bf.pad_unsigned_be(BitIndex::bits(128));
237 }
238
239 UInt(u128::from_be_bytes(bf.into_array().unwrap()))
240 }
241
242 fn from_bf_le(bf: &BitField) -> UInt {
243 let mut bf = bf.clone();
244 let n = bf.len().total_bits();
245 if n > 128 {
246 bf.truncate_le(BitIndex::bits(128));
247 } else if n < 128 {
248 bf.pad_unsigned_le(BitIndex::bits(128));
249 }
250
251 UInt(u128::from_le_bytes(bf.into_array().unwrap()))
252 }
253}
254
255impl std::str::FromStr for UInt {
256 type Err = std::num::ParseIntError;
257
258 fn from_str(s: &str) -> Result<Self, Self::Err> {
259 Ok(UInt(u128::from_str(s)?))
260 }
261}
262
263impl std::cmp::PartialOrd for UInt {
264 fn partial_cmp(&self, other: &UInt) -> Option<std::cmp::Ordering> {
265 self.0.partial_cmp(&other.0)
266 }
267}
268
269#[derive(PartialEq, Eq, Ord, std::hash::Hash, Clone, Debug)]
270struct TwosCompInt(i128);
271
272impl FromBitField for TwosCompInt {
273 fn from_bf_be(bf: &BitField) -> TwosCompInt {
274 let mut bf = bf.clone();
275 let n = bf.len().total_bits();
276 if n > 128 {
277 bf.truncate_be(BitIndex::bits(128));
278 } else if n < 128 {
279 bf.pad_twos_compliment_be(BitIndex::bits(128));
280 }
281
282 TwosCompInt(i128::from_be_bytes(bf.into_array().unwrap()))
283 }
284
285 fn from_bf_le(bf: &BitField) -> TwosCompInt {
286 let mut bf = bf.clone();
287 let n = bf.len().total_bits();
288 if n > 128 {
289 bf.truncate_le(BitIndex::bits(128));
290 } else if n < 128 {
291 bf.pad_twos_compliment_le(BitIndex::bits(128));
292 }
293
294 TwosCompInt(i128::from_le_bytes(bf.into_array().unwrap()))
295 }
296}
297
298impl std::str::FromStr for TwosCompInt {
299 type Err = std::num::ParseIntError;
300
301 fn from_str(s: &str) -> Result<Self, Self::Err> {
302 Ok(TwosCompInt(i128::from_str(s)?))
303 }
304}
305
306impl std::cmp::PartialOrd for TwosCompInt {
307 fn partial_cmp(&self, other: &TwosCompInt) -> Option<std::cmp::Ordering> {
308 self.0.partial_cmp(&other.0)
309 }
310}
311
312#[derive(PartialEq, Eq, Ord, std::hash::Hash, Clone, Debug)]
313struct OnesCompInt(i128);
314
315impl FromBitField for OnesCompInt {
316 fn from_bf_be(bf: &BitField) -> OnesCompInt {
317 let mut bf = bf.clone();
318 let n = bf.len().total_bits();
319 if n > 128 {
320 bf.truncate_be(BitIndex::bits(128));
321 } else if n < 128 {
322 bf.pad_twos_compliment_be(BitIndex::bits(128));
324 }
325
326 let neg = bf.convert_unsigned_be(IntFormat::OnesCompliment);
327 let mut i = u128::from_be_bytes(bf.into_array().unwrap()) as i128;
328 if neg {
329 i *= -1
330 }
331 OnesCompInt(i)
332 }
333
334 fn from_bf_le(bf: &BitField) -> OnesCompInt {
335 let mut bf = bf.clone();
336 let n = bf.len().total_bits();
337 if n > 128 {
338 bf.truncate_le(BitIndex::bits(128));
339 } else if n < 128 {
340 bf.pad_twos_compliment_le(BitIndex::bits(128));
342 }
343
344 let neg = bf.convert_unsigned_le(IntFormat::OnesCompliment);
345 let mut i = u128::from_le_bytes(bf.into_array().unwrap()) as i128;
346 if neg {
347 i *= -1
348 }
349 OnesCompInt(i)
350 }
351}
352
353impl std::str::FromStr for OnesCompInt {
354 type Err = std::num::ParseIntError;
355
356 fn from_str(s: &str) -> Result<Self, Self::Err> {
357 Ok(OnesCompInt(i128::from_str(s)?))
358 }
359}
360
361impl std::cmp::PartialOrd for OnesCompInt {
362 fn partial_cmp(&self, other: &OnesCompInt) -> Option<std::cmp::Ordering> {
363 self.0.partial_cmp(&other.0)
364 }
365}
366
367#[derive(PartialEq, Eq, Ord, std::hash::Hash, Clone, Debug)]
368struct SignMagInt(i128);
369
370impl FromBitField for SignMagInt {
371 fn from_bf_be(bf: &BitField) -> SignMagInt {
372 let mut bf = bf.clone();
373 let n = bf.len().total_bits();
374 if n > 128 {
375 bf.truncate_be(BitIndex::bits(128));
376 } else if n < 128 {
377 bf.pad_sign_magnitude_be(BitIndex::bits(128));
378 }
379
380 let neg = bf.convert_unsigned_be(IntFormat::SignMagnitude);
381 let mut i = u128::from_be_bytes(bf.into_array().unwrap()) as i128;
382 if neg {
383 i *= -1
384 }
385 SignMagInt(i)
386 }
387
388 fn from_bf_le(bf: &BitField) -> SignMagInt {
389 let mut bf = bf.clone();
390 let n = bf.len().total_bits();
391 if n > 128 {
392 bf.truncate_le(BitIndex::bits(128));
393 } else if n < 128 {
394 bf.pad_sign_magnitude_le(BitIndex::bits(128));
395 }
396
397 let neg = bf.convert_unsigned_le(IntFormat::SignMagnitude);
398 let mut i = u128::from_le_bytes(bf.into_array().unwrap()) as i128;
399 if neg {
400 i *= -1
401 }
402 SignMagInt(i)
403 }
404}
405
406impl std::str::FromStr for SignMagInt {
407 type Err = std::num::ParseIntError;
408
409 fn from_str(s: &str) -> Result<Self, Self::Err> {
410 Ok(SignMagInt(i128::from_str(s)?))
411 }
412}
413
414impl std::cmp::PartialOrd for SignMagInt {
415 fn partial_cmp(&self, other: &SignMagInt) -> Option<std::cmp::Ordering> {
416 self.0.partial_cmp(&other.0)
417 }
418}
419
420#[derive(PartialEq, Eq, Ord, std::hash::Hash, Clone, Debug)]
421struct NegaInt(i128);
422
423impl FromBitField for NegaInt {
424 fn from_bf_be(bf: &BitField) -> NegaInt {
425 let mut bf = bf.clone();
426 let n = bf.len().total_bits();
427 if n > 128 {
428 bf.truncate_be(BitIndex::bits(128));
429 } else if n < 128 {
430 bf.pad_unsigned_be(BitIndex::bits(128));
432 }
433
434 let neg = bf.convert_unsigned_be(IntFormat::BaseMinusTwo);
435 let mut i = u128::from_be_bytes(bf.into_array().unwrap()) as i128;
436 if neg {
437 i *= -1
438 }
439 NegaInt(i)
440 }
441
442 fn from_bf_le(bf: &BitField) -> NegaInt {
443 let mut bf = bf.clone();
444 let n = bf.len().total_bits();
445 if n > 128 {
446 bf.truncate_le(BitIndex::bits(128));
447 } else if n < 128 {
448 bf.pad_unsigned_le(BitIndex::bits(128));
450 }
451
452 let neg = bf.convert_unsigned_le(IntFormat::BaseMinusTwo);
453 let mut i = u128::from_le_bytes(bf.into_array().unwrap()) as i128;
454 if neg {
455 i *= -1
456 }
457 NegaInt(i)
458 }
459}
460
461impl std::str::FromStr for NegaInt {
462 type Err = std::num::ParseIntError;
463
464 fn from_str(s: &str) -> Result<Self, Self::Err> {
465 Ok(NegaInt(i128::from_str(s)?))
466 }
467}
468
469impl std::cmp::PartialOrd for NegaInt {
470 fn partial_cmp(&self, other: &NegaInt) -> Option<std::cmp::Ordering> {
471 self.0.partial_cmp(&other.0)
472 }
473}
474
475#[derive(Clone, Debug)]
476struct Float32(f32);
477impl Float32 {
478 fn canonical_form(&self) -> u32 {
479 let mut u = self.0.clone().to_bits();
480
481 if u & 0x7F800000 == 0x7F800000 {
483 u &= 0xFFC00000;
487 u |= 0x00000001;
490
491 }
492 u
493 }
494}
495impl FromBitField for Float32 {
496 fn from_bf_be(bf: &BitField) -> Float32 {
497 let mut bf = bf.clone();
498 let n = bf.len().total_bits();
499 if n > 32 {
500 bf.truncate_be(BitIndex::bits(32));
501 } else if n < 32 {
502 panic!("Float 32 cannot be formed from a BitField with length {}", bf.len())
503 }
504
505 Float32(f32::from_be_bytes(bf.into_array().unwrap()))
506 }
507
508 fn from_bf_le(bf: &BitField) -> Float32 {
509 let mut bf = bf.clone();
510 let n = bf.len().total_bits();
511 if n > 32 {
512 bf.truncate_le(BitIndex::bits(32));
513 } else if n < 32 {
514 panic!("Float 32 cannot be formed from a BitField with length {}", bf.len())
515 }
516
517 Float32(f32::from_le_bytes(bf.into_array().unwrap()))
518 }
519}
520
521
522impl std::hash::Hash for Float32 {
523 fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
524 self.canonical_form().hash(state);
525 }
526}
527
528impl PartialEq for Float32 {
529 fn eq(&self, other: &Float32) -> bool {
530 self.canonical_form() == other.canonical_form()
531 }
532}
533
534impl Eq for Float32 {}
535
536impl std::str::FromStr for Float32 {
537 type Err = std::num::ParseFloatError;
538
539 fn from_str(s: &str) -> Result<Self, Self::Err> {
540 match s.to_lowercase().as_str() {
541 "snan" | "+snan" => {
542 Ok(Float32(f32::from_bits(0x7F800001)))
544 },
545 "-snan" => {
546 Ok(Float32(f32::from_bits(0xFF800001)))
548 },
549 "qnan" | "+qnan" => {
550 Ok(Float32(f32::from_bits(0x7FC00001)))
552 },
553 "-qnan" => {
554 Ok(Float32(f32::from_bits(0xFFC00001)))
556 },
557 _ => Ok(Float32(f32::from_str(s)?))
558 }
559
560 }
561}
562
563impl std::cmp::PartialOrd for Float32 {
564 fn partial_cmp(&self, other: &Float32) -> Option<std::cmp::Ordering> {
565 self.0.partial_cmp(&other.0)
566 }
567}
568
569use crate::Semb;
570#[derive(Clone, Debug)]
571struct SembFloat<const S: usize, const E: usize, const M: usize, const B: i32>(Semb<S, E, M, B>);
572
573impl<const S: usize, const E: usize, const M: usize, const B: i32> FromBitField for SembFloat<S, E, M, B> {
574 fn from_bf_be(bf: &BitField) -> SembFloat<S, E, M, B> {
575 SembFloat(Semb::from_bf_be(bf))
576 }
577
578 fn from_bf_le(bf: &BitField) -> SembFloat<S, E, M, B> {
579 SembFloat(Semb::from_bf_le(bf))
580 }
581}
582
583impl<const S: usize, const E: usize, const M: usize, const B: i32> std::hash::Hash for SembFloat<S, E, M, B> {
584 fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
585 self.0.canonical_form().hash(state);
586 }
587}
588
589impl<const S: usize, const E: usize, const M: usize, const B: i32> PartialEq for SembFloat<S, E, M, B> {
590 fn eq(&self, other: &SembFloat<S, E, M, B>) -> bool {
591 self.0.canonical_form() == other.0.canonical_form()
592 }
593}
594
595impl<const S: usize, const E: usize, const M: usize, const B: i32> std::cmp::Eq for SembFloat<S, E, M, B> {}
596
597
598impl<const S: usize, const E: usize, const M: usize, const B: i32> std::str::FromStr for SembFloat<S, E, M, B> {
599 type Err = crate::semb::ParseSembError;
600
601 fn from_str(s: &str) -> Result<Self, Self::Err> {
602 Ok(SembFloat(Semb::from_str(s)?))
603
604 }
605}
606
607impl<const S: usize, const E: usize, const M: usize, const B: i32> std::cmp::PartialOrd for SembFloat<S, E, M, B> {
608 fn partial_cmp(&self, other: &SembFloat<S, E, M, B>) -> Option<std::cmp::Ordering> {
609 self.0.partial_cmp(&other.0)
610 }
611}
612
613#[derive(PartialEq, Eq, Debug)]
614enum Endian {
615 Big,
616 Little
617}
618
619#[derive(PartialEq, Debug)]
620struct CharClass<T: std::str::FromStr + PartialOrd + std::hash::Hash + FromBitField + std::cmp::Eq> {
621 inverted: bool,
622 endian: Endian,
623 options: HashSet<T>,
624 ranges: Vec<(T, T)>,
625 input_length: BitIndex
626}
627
628impl<T: std::str::FromStr + PartialOrd + std::hash::Hash + FromBitField + std::cmp::Eq + std::fmt::Debug> CharClass<T> {
629 fn input_length(&self) -> BitIndex {
630 self.input_length
631 }
632
633 fn matches(&self, input: &BitField) -> bool {
634 let x: T = match self.endian {
635 Endian::Big => T::from_bf_be(input),
636 Endian::Little => T::from_bf_le(input)
637 };
638 if self.options.contains(&x) {
640 return !self.inverted
641 }
642 for (start, end) in &self.ranges {
643 if *start <= x && x <= *end {
644 return !self.inverted
645 }
646 }
647 return self.inverted
648 }
649}
650
651fn parse_ascii_char_class(input: &Vec<char>, endian: Endian) -> Result<CharClass<UInt>, String> {
652 let mut input_iter = escape_vec(input).peekable();
653 let mut options = HashSet::<UInt>::new();
654 let mut ranges = Vec::<(UInt, UInt)>::new();
655 let mut prev_char: Option<UInt> = None;
656 let mut range_started = false;
658
659 let inverted = if input_iter.peek() == Some(&('^', false)) {
660 input_iter.next();
661 true
662 } else {
663 false
664 };
665
666 for (c, escaped) in input_iter {
667 match c {
668 '-' if !escaped => {
669 match prev_char {
670 Some(_) if range_started => return Err("Unexpected '-' encountered after '-'".to_string()),
671 Some(_) => {
672 range_started = true;
673 continue
674 },
675 None => return Err("Unexpected '-' encountered at beginning of character class".to_string()),
676 }
677 },
678 _ if range_started => {
679 match prev_char {
680 Some(ref p) => {
681 ranges.push((p.clone(), UInt(c as u128)));
682 prev_char = None;
683 },
684 None => return Err("Impossible state".to_string())
685 }
686 },
687 _ => {
688 match prev_char {
689 Some(ref p) => {
690 options.insert(p.clone());
691 },
692 None => ()
693 }
694 prev_char = Some(UInt(c as u128));
695 }
696 }
697 range_started = false;
698
699 }
700 match prev_char {
701 Some(ref p) => {
702 options.insert(p.clone());
703 },
704 None => ()
705 }
706 Ok(CharClass::<UInt> {inverted, endian, options, ranges, input_length: BitIndex::new(1, 0)})
707}
708
709enum RangeParseProgress<T: std::str::FromStr + PartialOrd + std::hash::Hash + FromBitField + std::fmt::Debug + std::cmp::Eq> {
710 NotStarted,
711 OneDot(T),
712 TwoDot(T)
713}
714
715fn parse_int_char_class<T: std::str::FromStr + PartialOrd + std::hash::Hash + FromBitField + std::fmt::Debug + Eq>(input: &Vec<char>, nbits: usize, endian: Endian) -> Result<CharClass<T>, String>
716where <T as std::str::FromStr>::Err: std::fmt::Display {
717 let input_length = bx!(,nbits);
718 let mut input_iter = escape_vec(input).peekable();
719 let mut options = HashSet::<T>::new();
720 let mut ranges = Vec::<(T, T)>::new();
721 let mut prev_word = Vec::<char>::new();
722 let mut range_prog = RangeParseProgress::<T>::NotStarted;
723
724 let inverted = if input_iter.peek() == Some(&('^', false)) {
725 input_iter.next();
726 true
727 } else {
728 false
729 };
730
731 for (c, escaped) in input_iter {
732 if escaped {
733 return Err("Escaped characters invalid in decimal character class".to_string());
734 }
735 match c {
736 ',' => {
737 match prev_word.iter().collect::<String>().parse::<T>() {
738 Ok(n) => {
739 match range_prog {
740 RangeParseProgress::NotStarted => {
741 options.insert(n);
742 },
743 RangeParseProgress::TwoDot(n0) => {
744 ranges.push((n0, n));
745 range_prog = RangeParseProgress::NotStarted;
746 },
747 _ => return Err(format!("Encountered unexpected character '{}' while parsing range", c))
748 }
749 },
750 Err(msg) => return Err(msg.to_string())
751 }
752 prev_word.clear();
753 },
754 '.' => {
755 match range_prog {
756 RangeParseProgress::NotStarted => {
757 match prev_word.iter().collect::<String>().parse::<T>() {
758 Ok(n) => {
759 range_prog = RangeParseProgress::OneDot(n);
760 prev_word.clear();
761 },
762 Err(msg) => return Err(msg.to_string())
763 }
764 },
765 RangeParseProgress::OneDot(n0) => {
766 if !prev_word.is_empty() {
767 return Err("Impossible State".to_string())
768 }
769 range_prog = RangeParseProgress::TwoDot(n0);
770 },
771 RangeParseProgress::TwoDot(_) => {
772 return Err(format!("Encountered unexpected character '{}' while parsing range", c))
773 }
774 }
775 },
776 '-' | '+' | '0'..='9' => {
777 match range_prog {
778 RangeParseProgress::OneDot(_) => {
779 return Err(format!("Encountered unexpected character '{}' while parsing range", c))
780 },
781 _ => ()
782 }
783 prev_word.push(c);
784 },
785 _ => return Err(format!("Encountered unexpected character '{}' while parsing decimal character class", c))
786 }
787
788 }
789
790 if prev_word.is_empty() {
791 return Err("Encountered end of character class before expected".to_string())
792 }
793 match prev_word.iter().collect::<String>().parse::<T>() {
794 Ok(n) => {
795 match range_prog {
796 RangeParseProgress::NotStarted => {
797 options.insert(n);
798 },
799 RangeParseProgress::TwoDot(n0) => {
800 ranges.push((n0, n));
801 },
802 _ => return Err("Encountered end of character class before expected".to_string())
803 }
804 },
805 Err(msg) => return Err(msg.to_string())
806 }
807
808 Ok(CharClass::<T> {inverted, endian, options, ranges, input_length})
809}
810
811fn parse_num_char_class<T: std::str::FromStr + PartialOrd + std::hash::Hash + FromBitField + std::fmt::Debug + Eq>(input: &Vec<char>, nbits: usize, endian: Endian) -> Result<CharClass<T>, String>
812where <T as std::str::FromStr>::Err: std::fmt::Display {
813 let input_length = bx!(,nbits);
814 let mut input_iter = input.into_iter().peekable();
815 let mut options = HashSet::<T>::new();
816 let mut ranges = Vec::<(T, T)>::new();
817
818 let inverted = if input_iter.peek() == Some(&&'^') {
819 input_iter.next();
820 true
821 } else {
822 false
823 };
824
825 let body: String = input_iter.collect();
826
827 for word in body.split(",") {
828 match word.split_once("..") {
829 Some((lhs, rhs)) => {
830 let lhs = match lhs.parse::<T>() {
831 Ok(n) => n,
832 Err(err) => return Err(err.to_string())
833 };
834 let rhs = match rhs.parse::<T>() {
835 Ok(n) => n,
836 Err(err) => return Err(err.to_string())
837 };
838 ranges.push((lhs, rhs));
839 },
840 None => {
841 let value = match word.parse::<T>() {
842 Ok(n) => n,
843 Err(err) => return Err(err.to_string())
844 };
845 options.insert(value);
846 }
847 }
848 }
849
850 Ok(CharClass::<T> {inverted, endian, options, ranges, input_length})
851}
852
853fn parse_char_class(input: &Vec<char>) -> Result<Token, String> {
854 let mut from_start = Vec::<char>::new();
855 for (i, c) in input.iter().enumerate() {
856 match c {
857 '<' | '>' | '0'..='9'|'a'..='z'|'A'..='Z' => {
858 from_start.push(*c);
859 },
860 ':' => {
861 let mut char_class_type_index = 0;
862 let endian = match from_start[0] {
863 '<' => {
864 char_class_type_index += 1;
865 Endian::Little
866 },
867 '>' => {
868 char_class_type_index += 1;
869 Endian::Big
870 },
871 _ => Endian::Big
872 };
873
874 match from_start[(char_class_type_index + 1)..].iter().collect::<String>().parse::<usize>() {
875 Ok(n) => {
876 match from_start[char_class_type_index] {
877 'a' => {
878 let cls_result = parse_ascii_char_class(&input[i+1..].to_vec(), endian);
879 match cls_result {
880 Ok(cls) => {
881 return Ok(Token::CharacterClass(std::rc::Rc::new(DynamicCharacterClass::UInt(cls))))
882 },
883 Err(msg) => return Err(msg)
884 }
885 },
886 'u' => {
887 let cls_result = parse_num_char_class::<UInt>(&input[i+1..].to_vec(), n, endian);
888 match cls_result {
889 Ok(cls) => {
890 return Ok(Token::CharacterClass(std::rc::Rc::new(DynamicCharacterClass::UInt(cls))))
891 },
892 Err(msg) => return Err(msg)
893 }
894 },
895 'i' => {
896 let cls_result = parse_num_char_class::<TwosCompInt>(&input[i+1..].to_vec(), n, endian);
897 match cls_result {
898 Ok(cls) => {
899 return Ok(Token::CharacterClass(std::rc::Rc::new(DynamicCharacterClass::TwosCompInt(cls))))
900 },
901 Err(msg) => return Err(msg)
902 }
903 },
904 'o' => {
905 let cls_result = parse_num_char_class::<OnesCompInt>(&input[i+1..].to_vec(), n, endian);
906 match cls_result {
907 Ok(cls) => {
908 return Ok(Token::CharacterClass(std::rc::Rc::new(DynamicCharacterClass::OnesCompInt(cls))))
909 },
910 Err(msg) => return Err(msg)
911 }
912 },
913 's' => {
914 let cls_result = parse_num_char_class::<SignMagInt>(&input[i+1..].to_vec(), n, endian);
915 match cls_result {
916 Ok(cls) => {
917 return Ok(Token::CharacterClass(std::rc::Rc::new(DynamicCharacterClass::SignMagInt(cls))))
918 },
919 Err(msg) => return Err(msg)
920 }
921 },
922 'n' => {
923 let cls_result = parse_num_char_class::<NegaInt>(&input[i+1..].to_vec(), n, endian);
924 match cls_result {
925 Ok(cls) => {
926 return Ok(Token::CharacterClass(std::rc::Rc::new(DynamicCharacterClass::NegaInt(cls))))
927 },
928 Err(msg) => return Err(msg)
929 }
930 },
931 'f' => {
932 match n {
933 16 => {
934 let cls_result = parse_num_char_class::<SembFloat<1, 5, 10, 15>>(&input[i+1..].to_vec(), n, endian);
935 match cls_result {
936 Ok(cls) => {
937 return Ok(Token::CharacterClass(std::rc::Rc::new(DynamicCharacterClass::Float16(cls))))
938 },
939 Err(msg) => return Err(msg)
940 }
941 },
942 32 => {
943 let cls_result = parse_num_char_class::<SembFloat<1, 8, 23, 127>>(&input[i+1..].to_vec(), n, endian);
944 match cls_result {
945 Ok(cls) => {
946 return Ok(Token::CharacterClass(std::rc::Rc::new(DynamicCharacterClass::Float32(cls))))
947 },
948 Err(msg) => return Err(msg)
949 }
950 },
951 64 => {
952 let cls_result = parse_num_char_class::<SembFloat<1, 11, 52, 1023>>(&input[i+1..].to_vec(), n, endian);
953 match cls_result {
954 Ok(cls) => {
955 return Ok(Token::CharacterClass(std::rc::Rc::new(DynamicCharacterClass::Float64(cls))))
956 },
957 Err(msg) => return Err(msg)
958 }
959 },
960 128 => {
961 let cls_result = parse_num_char_class::<SembFloat<1, 15, 112, 16383>>(&input[i+1..].to_vec(), n, endian);
962 match cls_result {
963 Ok(cls) => {
964 return Ok(Token::CharacterClass(std::rc::Rc::new(DynamicCharacterClass::Float128(cls))))
965 },
966 Err(msg) => return Err(msg)
967 }
968 },
969 _ => {
970 return Err(format!("{}-bit float format not recognized", n))
971 }
972 }
973
974 },
975 _ => return Err(format!("Character class type '{}' not recognized", from_start.iter().collect::<String>()))
976 }
977 },
978 Err(msg) => return Err(msg.to_string())
979 }
980 },
981 _ => break
982 }
983 }
984 match parse_ascii_char_class(input, Endian::Big) {
985 Ok(cls) => {
986 Ok(Token::CharacterClass(std::rc::Rc::new(DynamicCharacterClass::UInt(cls))))
987 },
988 Err(msg) => Err(msg)
989 }
990}
991
992fn compile(input: &Vec<Token>, start_index: usize) -> Vec<Vec<(usize, StateTransition, Vec<StateFlag>)>> {
993 let mut result = Vec::<Vec<(usize, StateTransition, Vec<StateFlag>)>>::new();
994 let mut i = start_index;
995 for token in input {
996 match token {
997 Token::Repetition(Repeat::Any(greediness), inside) => {
998 let inside_result = compile(&vec![*inside.clone()], i + 1);
999 let inside_len = inside_result.len();
1000 match greediness {
1001 Greediness::Greedy => {
1002 result.push(vec![(i + 1, StateTransition::Free, vec![]), (i + inside_len + 2, StateTransition::Free, vec![])]);
1003 result.extend(inside_result);
1004 result.push(vec![(i + 1, StateTransition::Free, vec![]), (i + inside_len + 2, StateTransition::Free, vec![])]);
1005 },
1006 Greediness::Lazy => {
1007 result.push(vec![(i + inside_len + 2, StateTransition::Free, vec![]), (i + 1, StateTransition::Free, vec![])]);
1008 result.extend(inside_result);
1009 result.push(vec![(i + inside_len + 2, StateTransition::Free, vec![]), (i + 1, StateTransition::Free, vec![])]);
1010 }
1011 }
1012
1013 i += inside_len + 2;
1014 },
1015 Token::Repetition(Repeat::Exactly(n), inside) => {
1016 for _ in 0..*n {
1017 let inside_result = compile(&vec![*inside.clone()], i);
1018 i += inside_result.len();
1019 result.extend(inside_result);
1020 }
1021 },
1022 Token::Repetition(Repeat::LessThan(n, greediness), inside) => {
1023 let start_i = i - start_index;
1024 result.push(Vec::<(usize, StateTransition, Vec<StateFlag>)>::new());
1025 i += 1;
1026 for _ in 0..(*n - 1) {
1027 result[start_i].push((i, StateTransition::Free, vec![]));
1028 let inside_result = compile(&vec![*inside.clone()], i);
1029 i += inside_result.len();
1030 result.extend(inside_result);
1031 }
1032 result[start_i].push((i, StateTransition::Free, vec![]));
1033 if matches!(greediness, Greediness::Lazy) {
1034 result[start_i].reverse();
1035 }
1036 },
1037 Token::Choice(inside) => {
1038 let mut finish_indices = Vec::<usize>::new();
1039 let start_i = i - start_index;
1040 result.push(Vec::<(usize, StateTransition, Vec<StateFlag>)>::new());
1041 i += 1;
1042 for tok in inside {
1043 result[start_i].push((i, StateTransition::Free, vec![]));
1044 let inside_result = compile(&vec![tok.clone()], i);
1045 i += inside_result.len();
1046 result.extend(inside_result);
1047 result.push(Vec::<(usize, StateTransition, Vec<StateFlag>)>::new());
1048 finish_indices.push(i - start_index);
1049 i += 1;
1050 }
1051 for finish_i in finish_indices {
1052 result[finish_i].push((i, StateTransition::Free, vec![]));
1053 }
1054
1055 },
1056 Token::Group(Some(group_index), inside) => {
1057 let inside_result = compile(inside, i + 1);
1058 let inside_len = inside_result.len();
1059 result.push(vec![(i + 1, StateTransition::Free, vec![StateFlag::GroupStart(*group_index)])]);
1060 result.extend(inside_result);
1061 result.push(vec![(i + inside_len + 2, StateTransition::Free, vec![StateFlag::GroupEnd(*group_index)])]);
1062 i += inside_len + 2;
1063 },
1064 Token::Group(None, inside) => {
1065 let inside_result = compile(inside, i);
1066 let inside_len = inside_result.len();
1067 result.extend(inside_result);
1068 i += inside_len;
1069 },
1070 Token::CharacterClass(cls) => {
1071 result.push(vec![(i + 1, StateTransition::CharacterClass(cls.clone()), vec![])]);
1072 i += 1;
1073 }
1074 Token::BitZero => {
1075 result.push(vec![(i + 1, StateTransition::EqualsBit(0), vec![])]);
1076 i += 1;
1077 },
1078 Token::BitOne => {
1079 result.push(vec![(i + 1, StateTransition::EqualsBit(1), vec![])]);
1080 i += 1;
1081 },
1082 Token::BitAny => {
1083 result.push(vec![(i + 1, StateTransition::ConsumeBits(1), vec![])]);
1084 i += 1;
1085 },
1086 Token::Nibble(n) => {
1087 result.push(vec![(i + 1, StateTransition::EqualsNibble(*n), vec![])]);
1088 i += 1;
1089 }
1090 Token::ByteAny => {
1091 result.push(vec![(i + 1, StateTransition::ConsumeBits(8), vec![])]);
1092 i += 1;
1093 },
1094 Token::ByteBoundary => {
1095 result.push(vec![(i + 1, StateTransition::Free, vec![StateFlag::NotOffsetAnd(0x07)])]);
1096 i += 1;
1097 },
1098 Token::NibbleBoundary => {
1099 result.push(vec![(i + 1, StateTransition::Free, vec![StateFlag::NotOffsetAnd(0x03)])]);
1100 i += 1;
1101 }
1102 }
1103 }
1104 result
1105}
1106
1107fn normalize_index(removed: &Vec<usize>, index: usize) -> usize {
1108 let mut result = index;
1109 for r in removed {
1110 if *r < index {
1111 result -= 1;
1112 }
1113 }
1114 result
1115}
1116
1117fn get_direct_transitions(fsm: &Vec<Vec<(usize, StateTransition, Vec<StateFlag>)>>, state: usize) -> Vec<(usize, StateTransition, Vec<StateFlag>)> {
1118 let mut result = Vec::<(usize, StateTransition, Vec<StateFlag>)>::new();
1119 let mut explorer_state = state;
1120 let mut path = Vec::<(usize, usize)>::new(); let mut sum_flags = Vec::<Vec<StateFlag>>::new();
1122 let mut t_index = 0;
1123 let final_depth = fsm.len();
1124 loop {
1126 let mut transitions = &fsm[explorer_state];
1127 while t_index >= transitions.len() { sum_flags.pop();
1130 match path.pop() {
1131 Some((dest, i)) => {
1132 explorer_state = dest;
1134 transitions = &fsm[explorer_state];
1135 t_index = i;
1136 },
1137 None => return result }
1139 }
1140 match &transitions[t_index] {
1141 (dest, StateTransition::Free, flags) if *dest != final_depth => { if path.contains(&(explorer_state, t_index + 1)) {
1144 t_index += 1;
1145 continue }
1147 path.push((explorer_state, t_index + 1));
1148 sum_flags.push(flags.to_vec());
1149 explorer_state = *dest;
1150 t_index = 0;
1151 continue
1152 },
1153 (dest, t, final_flags) => { let mut new_flags = Vec::<StateFlag>::new();
1156 for flags in &sum_flags {
1157 new_flags.extend(flags.to_vec());
1158 }
1159 new_flags.extend(final_flags.to_vec());
1160 result.push((*dest, t.clone(), new_flags));
1161 t_index += 1;
1162 }
1163 }
1164 }
1165}
1166
1167fn remove_dead_states(fsm: &mut Vec<Vec<(usize, StateTransition, Vec<StateFlag>)>>) {
1168 let mut valid_states = HashSet::new();
1169 valid_states.insert(0);
1170 for transitions in fsm.iter() {
1171 for (dest, _t, _flags) in transitions {
1172 valid_states.insert(*dest);
1173 }
1174 }
1175 let n_states = fsm.len();
1176 let mut removed = Vec::<usize>::new();
1177 for state in 0..n_states {
1178 if !valid_states.contains(&state) {
1179 removed.push(state);
1180 }
1181 }
1182
1183 removed.reverse();
1184
1185 for r in &removed {
1186 fsm.remove(*r);
1187 }
1188
1189 for transitions in fsm.iter_mut() {
1190 for transition in transitions {
1191 transition.0 = normalize_index(&removed, transition.0);
1192 }
1193 }
1194
1195}
1196
1197fn optimize(fsm: &mut Vec<Vec<(usize, StateTransition, Vec<StateFlag>)>>) {
1198
1199 let n_states = fsm.len();
1200 for state in 0..n_states {
1201 let new_transitions = get_direct_transitions(fsm, state);
1202 fsm[state] = new_transitions;
1203 }
1204
1205 remove_dead_states(fsm);
1208
1209}
1210
1211fn tokenize(pattern: &str) -> Result<(Vec<Token>, usize, std::collections::HashMap<String, usize>), String> {
1212 let char_vec: Vec<char> = pattern.chars().collect();
1213 tokenize_vec(&char_vec, 1)
1214}
1215
1216fn parse_group_header(input: &[char]) -> Result<(GroupType, usize), String> {
1217 let mut input_iter = input.iter();
1218 match &input_iter.next() {
1219 Some('?') => {
1220 match &input_iter.next() {
1221 Some('<') => {
1222 let mut name = Vec::<char>::new();
1223 let mut offset = 2;
1224 for c in input_iter {
1225 match c {
1226 '>' => {
1227 return Ok((GroupType::Named(name.into_iter().collect::<String>()), offset + 1));
1228 },
1229 _ => name.push(*c)
1230 }
1231 offset += 1;
1232 }
1233 Err("Unterminated group name".to_string())
1234 },
1235 Some(':') => {
1236 Ok((GroupType::NonCapturing, 2))
1237 },
1238 _ => Err("'?' at beginning of group".to_string())
1239 }
1240 },
1241 _ => Ok((GroupType::Numbered, 0))
1242 }
1243}
1244
1245fn tokenize_vec(char_vec: &Vec<char>, start_group: usize) -> Result<(Vec<Token>, usize, std::collections::HashMap<String, usize>), String> {
1246 let mut result = Vec::<Token>::new();
1247 let mut i = 0;
1248 let mut option_flag = false;
1249 let mut groups_map = std::collections::HashMap::new();
1250 let mut group_index = start_group;
1251 loop {
1252 if i >= char_vec.len() {
1253 break
1254 }
1255 let result_len = result.len();
1256 match char_vec[i] {
1257 '_' => result.push(Token::BitZero),
1258 '\'' => result.push(Token::BitOne),
1259 '.' => result.push(Token::BitAny),
1260 '!' => result.push(Token::ByteAny),
1261 ':' => result.push(Token::ByteBoundary),
1262 ';' => result.push(Token::NibbleBoundary),
1263 '?' => {
1264 match result.pop() {
1265 Some(last) => {
1266 let mut greediness = Greediness::Greedy;
1267 if i + 1 < char_vec.len() && char_vec[i + 1] == '?' {
1268 greediness = Greediness::Lazy;
1269 i += 1;
1270 }
1271 result.push(Token::Repetition(Repeat::LessThan(2, greediness), Box::new(last)));
1272 },
1273 None => return Err("Encountered '?' at beginning of group".to_string())
1274 }
1275 },
1276 '*' => {
1277 match result.pop() {
1278 Some(last) => {
1279 let mut greediness = Greediness::Greedy;
1280 if i + 1 < char_vec.len() && char_vec[i + 1] == '?' {
1281 greediness = Greediness::Lazy;
1282 i += 1;
1283 }
1284 result.push(Token::Repetition(Repeat::Any(greediness), Box::new(last)));
1285 },
1286 None => return Err("Encountered '*' at beginning of group".to_string())
1287 }
1288 },
1289 '+' => {
1290 if result.is_empty() {
1291 return Err("Encountered '+' at beginning of group".to_string())
1292 } else {
1293 let mut greediness = Greediness::Greedy;
1294 if i + 1 < char_vec.len() && char_vec[i + 1] == '?' {
1295 greediness = Greediness::Lazy;
1296 i += 1;
1297 }
1298 let last = result[result.len() - 1].clone();
1299 result.push(Token::Repetition(Repeat::Any(greediness), Box::new(last)))
1300 }
1301 },
1302 '|' => {
1303 option_flag = true;
1304 },
1305 '[' => {
1306 match find_right_delimiter(char_vec, i, '[', ']') {
1307 Some(end_index) => {
1308 match parse_char_class(&char_vec[i+1..end_index].to_vec()) {
1309 Ok(token) => result.push(token),
1310 Err(msg) => return Err(msg)
1311 }
1312 i = end_index;
1313 },
1314 None => return Err("Unclosed opening delimiter '['".to_string())
1315 }
1316 },
1317 '{' => {
1318 match find_right_delimiter(char_vec, i, '{', '}') {
1319 Some(end_index) => {
1320 let inside = &char_vec[i+1..end_index];
1321 i = end_index;
1322
1323 let mut greediness = Greediness::Greedy;
1324 if i + 1 < char_vec.len() && char_vec[i + 1] == '?' {
1325 greediness = Greediness::Lazy;
1326 i += 1;
1327 }
1328
1329 match inside.iter().position(|x| *x == ',') {
1330 Some(sep) => {
1331 if sep + 1 == inside.len() { match inside[..sep].iter().collect::<String>().parse::<usize>() {
1333 Ok(n) => {
1334 match result.pop() {
1335 Some(last) => {
1336 result.push(Token::Repetition(Repeat::Exactly(n), Box::new(last.clone())));
1337 result.push(Token::Repetition(Repeat::Any(greediness), Box::new(last)));
1338 },
1339 None => return Err("Encountered '{' at beginning of group".to_string())
1340 }
1341 },
1342 Err(msg) => return Err(msg.to_string())
1343 }
1344 } else { match (inside[..sep].iter().collect::<String>().parse::<usize>(),
1346 inside[1+sep..].iter().collect::<String>().parse::<usize>())
1347 {
1348 (Ok(n), Ok(m)) => {
1349 match result.pop() {
1350 Some(last) => {
1351 result.push(Token::Repetition(Repeat::Exactly(n), Box::new(last.clone())));
1352 result.push(Token::Repetition(Repeat::LessThan(m - n + 1, greediness), Box::new(last)));
1353 },
1354 None => return Err("Encountered '{' at beginning of group".to_string())
1355 }
1356 },
1357 (Err(msg), _) => return Err(msg.to_string()),
1358 (_, Err(msg)) => return Err(msg.to_string()),
1359 }
1360 }
1361 },
1362 None => { match inside.iter().collect::<String>().parse::<usize>() {
1364 Ok(n) => {
1365 match result.pop() {
1366 Some(last) => result.push(Token::Repetition(Repeat::Exactly(n), Box::new(last))),
1367 None => return Err("Encountered '{' at beginning of group".to_string())
1368 }
1369 },
1370 Err(msg) => return Err(msg.to_string())
1371 }
1372 }
1373 }
1374 },
1375 None => return Err("Unclosed opening delimiter '{'".to_string())
1376 }
1377 },
1378 '(' => {
1379 match find_right_delimiter(char_vec, i, '(', ')') {
1380 Some(end_index) => {
1381 match parse_group_header(&char_vec[i+1..end_index]) {
1382 Ok((GroupType::Numbered, start_index)) => {
1383 match tokenize_vec(&char_vec[i+1+start_index..end_index].to_vec(), group_index + 1) {
1384 Ok((tokens, new_groups, new_groups_map)) => {
1385 result.push(Token::Group(Some(group_index), tokens));
1386 group_index += new_groups + 1;
1387 for (key, val) in new_groups_map.iter() {
1388 if groups_map.contains_key(key) {
1389 return Err(format!("Group name '{}' used more than once", key))
1390 } else {
1391 groups_map.insert(key.clone(), *val);
1392 }
1393 }
1394 },
1395 Err(msg) => return Err(msg)
1396 }
1397 i = end_index;
1398 },
1399 Ok((GroupType::Named(name), start_index)) => {
1400 if groups_map.contains_key(&name) {
1401 return Err(format!("Group name '{}' used more than once", name))
1402 } else {
1403 groups_map.insert(name, group_index);
1404 }
1405 match tokenize_vec(&char_vec[i+1+start_index..end_index].to_vec(), group_index + 1) {
1406 Ok((tokens, new_groups, new_groups_map)) => {
1407 result.push(Token::Group(Some(group_index), tokens));
1408 group_index += new_groups + 1;
1409 for (key, val) in new_groups_map.iter() {
1410 if groups_map.contains_key(key) {
1411 return Err(format!("Group name '{}' used more than once", key))
1412 } else {
1413 groups_map.insert(key.clone(), *val);
1414 }
1415 }
1416 },
1417 Err(msg) => return Err(msg)
1418 }
1419 i = end_index;
1420 },
1421 Ok((GroupType::NonCapturing, start_index)) => {
1422 match tokenize_vec(&char_vec[i+1+start_index..end_index].to_vec(), group_index) {
1423 Ok((tokens, new_groups, new_groups_map)) => {
1424 result.push(Token::Group(None, tokens));
1425 group_index += new_groups;
1426 for (key, val) in new_groups_map.iter() {
1427 if groups_map.contains_key(key) {
1428 return Err(format!("Group name '{}' used more than once", key))
1429 } else {
1430 groups_map.insert(key.clone(), *val);
1431 }
1432 }
1433 },
1434 Err(msg) => return Err(msg)
1435 }
1436 i = end_index;
1437 },
1438 _ => todo!()
1439 }
1440
1441 },
1442 None => return Err("Unclosed opening delimiter '('".to_string())
1443 }
1444 },
1445 c => {
1446 match c {
1447 '0'..='9' => {
1448 result.push(Token::Nibble(c as u8 - 48));
1449 },
1450 'a'..='f' => {
1451 result.push(Token::Nibble(c as u8 - 87));
1452 },
1453 'A'..='F' => {
1454 result.push(Token::Nibble(c as u8 - 55));
1455 },
1456 _ => return Err(format!("Encountered unexpected character '{}'", c))
1457 }
1458 }
1459 }
1460 if option_flag && result.len() != result_len { match result.pop() {
1462 Some(last) => {
1463 match result.pop() {
1464 Some(Token::Choice(options)) => {
1465 let mut options = options.to_vec();
1466 options.push(last);
1467 result.push(Token::Choice(options));
1468 option_flag = false;
1469 },
1470 Some(token) => {
1471 result.push(Token::Choice(vec![token, last]));
1472 option_flag = false;
1473 },
1474 None => return Err("Encountered unexpected character '|'".to_string())
1475 }
1476 },
1477 None => return Err("Encountered '|' at beginning of group".to_string())
1478 }
1479 }
1480 i += 1;
1481 }
1482 Ok((result, group_index - start_group, groups_map))
1483}
1484
1485fn check_state_flags<T: BitIndexable>(input: &T, flags: &Vec<StateFlag>, offset: BitIndex) -> bool {
1486 for flag in flags {
1487 match flag {
1488 StateFlag::GroupStart(_) | StateFlag::GroupEnd(_) => (),
1489 StateFlag::NotOffsetAnd(value) => {
1490 if (offset.bit() as usize & value) != 0 {
1491 return false
1492 }
1493 }
1494 }
1495 }
1496 true
1497}
1498
1499
1500#[derive(PartialEq, Eq, Debug, Clone, Copy)]
1531pub struct BinMatch<'a, T: BitIndexable> {
1532 input: &'a T,
1533 start: BitIndex,
1534 end: BitIndex
1535}
1536
1537impl<'a, T: BitIndexable> BinMatch<'a, T> {
1538
1539 pub fn new(input: &'a T, start: BitIndex, end: BitIndex) -> BinMatch<'a, T> {
1541 BinMatch {input, start, end}
1542 }
1543
1544 pub fn start(&self) -> BitIndex {
1546 self.start
1547 }
1548
1549 pub fn end(&self) -> BitIndex {
1551 self.end
1552 }
1553
1554 pub fn is_empty(&self) -> bool {
1556 self.start == self.end
1557 }
1558
1559 pub fn span(&self) -> (BitIndex, BitIndex) {
1561 (self.start, self.end)
1562 }
1563
1564 pub fn as_bf(&self) -> BitField {
1566 self.input.bit_slice(&self.start, &self.end)
1567 }
1568}
1569
1570pub struct BinMatches<'a, T: BitIndexable> {
1574 gen: CapturePathGenerator<'a, T>,
1575 max_index: BitIndex,
1576 current_offset: BitIndex
1577}
1578
1579impl<'a, T: BitIndexable> BinMatches<'a, T> {
1580 fn new(fsm: &'a Vec<Vec<(usize, StateTransition, Vec<StateFlag>)>>, input: &'a T) -> BinMatches<'a, T> {
1581 let gen = CapturePathGenerator::new(fsm, input);
1582 let max_index = gen.input.max_index();
1583 BinMatches {gen, max_index, current_offset: BitIndex::new(0, 0)}
1584 }
1585}
1586
1587impl<'a, T: BitIndexable> std::iter::Iterator for BinMatches<'a, T> {
1588 type Item = BinMatch<'a, T>;
1589
1590 fn next(&mut self) -> Option<BinMatch<'a, T>> {
1591 while self.current_offset < self.max_index {
1592 self.gen.reset(self.current_offset);
1593 if let Some((_, initial_offset)) = self.gen.next() {
1594 let mut max_offset = initial_offset;
1595 loop {
1596 match self.gen.next() {
1597 Some((_, offset)) if offset > max_offset => {
1598 max_offset = offset;
1599 },
1600 None => {
1601 let res = Some(BinMatch::new(self.gen.input, self.current_offset, max_offset));
1602 self.current_offset = max_offset;
1603 return res
1604 },
1605 _ => ()
1606 }
1607 }
1608 }
1609 self.current_offset += 1;
1610 }
1611 None
1612 }
1613}
1614
1615pub struct BinCaptures<'a, T: BitIndexable> {
1617 groups: Vec<Option<BinMatch<'a, T>>>,
1618 groups_map: &'a std::collections::HashMap<String, usize>
1619}
1620
1621impl<'a, T: BitIndexable> BinCaptures<'a, T> {
1622
1623 pub fn get(&self, i: usize) -> Option<BinMatch<T>> {
1628 if i < self.groups.len() {
1629 match &self.groups[i] {
1630 Some(m) => Some(BinMatch::new(m.input, m.start, m.end)),
1631 None => None
1632 }
1633 } else {
1634 None
1635 }
1636 }
1637
1638 pub fn name(&self, name: &str) -> Option<BinMatch<T>> {
1642 match self.groups_map.get(&name.to_string()) {
1643 Some(index) => self.get(*index),
1644 None => None
1645 }
1646 }
1647}
1648
1649struct CapturePathGenerator<'a, T: BitIndexable> {
1650 fsm: &'a Vec<Vec<(usize, StateTransition, Vec<StateFlag>)>>,
1651 input: &'a T,
1652 current_path: Vec::<(usize, usize, BitIndex)>,
1653 max_offset: BitIndex,
1654 current_offset: BitIndex,
1655 current_state: usize,
1656 t_index: usize
1657}
1658
1659impl<'a, T: BitIndexable> CapturePathGenerator<'a, T> {
1660
1661 fn new(fsm: &'a Vec<Vec<(usize, StateTransition, Vec<StateFlag>)>>, input: &'a T) -> CapturePathGenerator<'a, T> {
1662 CapturePathGenerator {
1663 fsm,
1664 input,
1665 current_path: Vec::<(usize, usize, BitIndex)>::new(),
1666 max_offset: input.max_index(),
1667 current_offset: BitIndex::new(0, 0),
1668 current_state: 0,
1669 t_index: 0
1670 }
1671 }
1672
1673 fn reset(&mut self, offset: BitIndex) {
1674 self.current_path = Vec::<(usize, usize, BitIndex)>::new();
1675 self.current_offset = offset;
1676 self.current_state = 0;
1677 self.t_index = 0;
1678 }
1679
1680 fn try_backtrack(&mut self) -> bool {
1681 match self.current_path.pop() {
1683 Some((dest, i, new_offset)) => {
1684 self.current_state = dest;
1685 self.t_index = i + 1;
1686 self.current_offset = new_offset;
1687 true
1688 },
1689 None => false
1690 }
1691 }
1692
1693 fn next(&mut self) -> Option<(Vec::<(usize, usize, BitIndex)>, BitIndex)> {
1694 self.try_backtrack();
1695 loop {
1696 let mut transitions = &self.fsm[self.current_state];
1697 loop {
1698
1699 while self.t_index >= transitions.len() {
1700 if !self.try_backtrack() {
1701 return None
1702 }
1703 transitions = &self.fsm[self.current_state];
1704 }
1705
1706 let (dest, transition, flags) = &transitions[self.t_index];
1707 if check_state_flags(self.input, flags, self.current_offset) {
1708 match transition {
1709 StateTransition::Free => {
1710 self.current_path.push((self.current_state, self.t_index, self.current_offset));
1711 self.current_state = *dest;
1712 break;
1713 },
1714 StateTransition::EqualsBit(value) => {
1715 if self.current_offset < self.max_offset && self.input.bit_at(&self.current_offset) == *value {
1716 self.current_path.push((self.current_state, self.t_index, self.current_offset));
1717 self.current_state = *dest;
1718 self.current_offset += 1;
1719 break;
1720 }
1721 },
1722 StateTransition::ConsumeBits(n) => {
1723 if self.current_offset + n <= self.max_offset {
1724 self.current_path.push((self.current_state, self.t_index, self.current_offset));
1725 self.current_state = *dest;
1726 self.current_offset += n;
1727 break;
1728 }
1729 },
1730 StateTransition::EqualsNibble(n) => {
1731 if self.current_offset + 3 < self.max_offset {
1732 let n2 = (self.input.bit_at(&self.current_offset) << 3) | (self.input.bit_at(&(self.current_offset + 1)) << 2) |
1733 (self.input.bit_at(&(self.current_offset + 2)) << 1) | self.input.bit_at(&(self.current_offset + 3));
1734 if *n == n2 {
1735 self.current_path.push((self.current_state, self.t_index, self.current_offset));
1736 self.current_state = *dest;
1737 self.current_offset += 4;
1738 break;
1739 }
1740 }
1741 },
1742 StateTransition::CharacterClass(cls) => {
1743 let n = cls.input_length();
1744 let end = self.current_offset + n;
1745 if end <= self.max_offset {
1746 let bf = self.input.bit_slice(&self.current_offset, &end);
1747 if cls.matches(&bf) {
1749 self.current_path.push((self.current_state, self.t_index, self.current_offset));
1750 self.current_state = *dest;
1751 self.current_offset = end;
1752 break;
1753 }
1754 }
1755 }
1756 }
1757 }
1758
1759 self.t_index += 1;
1760 }
1761 self.t_index = 0;
1762 if self.current_state >= self.fsm.len() {
1763
1764 return Some((self.current_path.clone(), self.current_offset))
1765 }
1766
1767 }
1768 }
1769}
1770
1771pub struct BinRegex {
1774 fsm: Vec<Vec<(usize, StateTransition, Vec<StateFlag>)>>,
1775 pub n_groups: usize,
1776 groups_map: std::collections::HashMap<String, usize>
1777}
1778
1779impl BinRegex {
1780
1781 pub fn new(pattern: &str) -> Result<BinRegex, String>{
1784 let (tokens, n_groups, groups_map) = tokenize(pattern)?;
1785 let n_groups = n_groups + 1;
1786 let mut fsm = compile(&tokens, 0);
1787 optimize(&mut fsm);
1788 Ok(BinRegex {fsm, n_groups, groups_map})
1790 }
1791
1792
1793 fn match_path<'a, T>(&self, input: &'a T) -> Option<(Vec::<(usize, usize, BitIndex)>, BitIndex)>
1794 where &'a T: BitIndexable {
1795 let mut gen = CapturePathGenerator::new(&self.fsm, &input);
1796 match gen.next() {
1797 Some((path, offset)) => {
1798 let mut max_offset = offset;
1799 let mut current_path = path;
1800 loop {
1801 match gen.next() {
1802 Some((path, offset)) if offset > max_offset => {
1803 max_offset = offset;
1804 current_path = path
1805 },
1806 None => return Some((current_path, max_offset)),
1807 _ => ()
1808 }
1809 }
1810 },
1811 None => None
1812 }
1813 }
1814
1815 pub fn find<'a, T>(&self, input: &'a T) -> Option<BinMatch<'a, T>>
1832 where &'a T: BitIndexable, T: BitIndexable {
1833 let mut gen = CapturePathGenerator::new(&self.fsm, input);
1834
1835 let mut i = BitIndex::new(0, 0);
1836 let max_index = input.max_index();
1837 while i < max_index {
1838 gen.reset(i);
1839 if let Some((_, initial_offset)) = gen.next() {
1840 let mut max_offset = initial_offset;
1841 loop {
1842 match gen.next() {
1843 Some((_, offset)) if offset > max_offset => {
1844 max_offset = offset;
1845 },
1846 None => return Some(BinMatch::new(input, i, max_offset)),
1847 _ => ()
1848 }
1849 }
1850 }
1851 i += 1;
1852 }
1853 None
1854 }
1855
1856 pub fn find_iter<'a, T>(&'a self, input: &'a T) -> BinMatches<'a, T>
1858 where &'a T: BitIndexable, T: BitIndexable {
1859 BinMatches::new(&self.fsm, input)
1860
1861 }
1862
1863 pub fn match_length<'a, T>(&'a self, input: &'a T) -> Option<BitIndex>
1864 where &'a T: BitIndexable {
1865 match self.match_path(input) {
1866 Some((_, offset)) => Some(offset),
1867 None => None
1868 }
1869 }
1870
1871 pub fn captures<'a, T>(&'a self, input: &'a T) -> Option<BinCaptures<T>>
1875 where &'a T: BitIndexable, T: BitIndexable {
1876 match self.match_path(input) {
1877 Some((mut path, offset)) => {
1878 let mut groups = Vec::<Option<BinMatch<T>>>::new();
1880 for _ in 0..self.n_groups {
1881 groups.push(None);
1882 }
1883
1884 let mut group_ends = std::collections::HashMap::new();
1885 path.reverse();
1886 for (state, t_index, offset) in path {
1887 let mut flags = self.fsm[state][t_index].2.to_vec();
1888 flags.reverse();
1890 for flag in flags {
1891 match flag {
1892 StateFlag::GroupEnd(group_index) => {
1893 group_ends.insert(group_index, offset);
1894 },
1895 StateFlag::GroupStart(group_index) => {
1896 match &groups[group_index] {
1897 None =>
1898 match group_ends.get(&group_index) {
1899 Some(end_index) => groups[group_index] = Some(BinMatch::new(input, offset, *end_index)),
1900 None => panic!("Group start with no end")
1901 },
1902 Some(bin_match) if bin_match.is_empty() => { match group_ends.get(&group_index) {
1904 Some(end_index) if *end_index != offset => groups[group_index] = Some(BinMatch::new(input, offset, *end_index)),
1905 _ => ()
1906 }
1907 },
1908 _ => ()
1909 }
1910 },
1911 _ => ()
1912 }
1913 }
1914 }
1915 groups[0] = Some(BinMatch::new(input, BitIndex::new(0, 0), offset));
1916 Some(BinCaptures {groups, groups_map: &self.groups_map})
1917 }
1918 None => None
1919 }
1920 }
1921}
1922
1923#[cfg(test)]
1924mod tests {
1925 use super::*;
1926
1927 #[test]
1928 fn uint_be_char_class() {
1929 let input = "u5:0,1,23".chars().collect();
1930 let res = parse_char_class(&input).unwrap();
1931 let cls = DynamicCharacterClass::UInt(CharClass::<UInt>{
1932 inverted: false,
1933 endian: Endian::Big,
1934 options: HashSet::from([UInt(0), UInt(1), UInt(23)]),
1935 ranges: vec![],
1936 input_length: bx!(,5)
1937 });
1938 assert_eq!(res, Token::CharacterClass(std::rc::Rc::new(cls)));
1939
1940 let input = ">u6:^0,1,23".chars().collect();
1941 let res = parse_char_class(&input).unwrap();
1942 let cls = DynamicCharacterClass::UInt(CharClass::<UInt>{
1943 inverted: true,
1944 endian: Endian::Big,
1945 options: HashSet::from([UInt(0), UInt(1), UInt(23)]),
1946 ranges: vec![],
1947 input_length: bx!(,6)
1948 });
1949 assert_eq!(res, Token::CharacterClass(std::rc::Rc::new(cls)));
1950
1951 let input = "u8:^0,50..64,1,23,100..128".chars().collect();
1952 let res = parse_char_class(&input).unwrap();
1953 let cls = DynamicCharacterClass::UInt(CharClass::<UInt>{
1954 inverted: true,
1955 endian: Endian::Big,
1956 options: HashSet::from([UInt(0), UInt(1), UInt(23)]),
1957 ranges: vec![(UInt(50),UInt(64)), (UInt(100), UInt(128))],
1958 input_length: bx!(,8)
1959 });
1960 assert_eq!(res, Token::CharacterClass(std::rc::Rc::new(cls)));
1961
1962 let input = ">u120:^0,51230..641234,1,23000,100..128".chars().collect();
1963 let res = parse_char_class(&input).unwrap();
1964 let cls = DynamicCharacterClass::UInt(CharClass::<UInt>{
1965 inverted: true,
1966 endian: Endian::Big,
1967 options: HashSet::from([UInt(0), UInt(1), UInt(23000)]),
1968 ranges: vec![(UInt(51230),UInt(641234)), (UInt(100), UInt(128))],
1969 input_length: bx!(,120)
1970 });
1971 assert_eq!(res, Token::CharacterClass(std::rc::Rc::new(cls)));
1972 }
1973
1974 #[test]
1975 fn uint_le_char_class() {
1976 let input = "<u5:0,1,23".chars().collect();
1977 let res = parse_char_class(&input).unwrap();
1978 let cls = DynamicCharacterClass::UInt(CharClass::<UInt>{
1979 inverted: false,
1980 endian: Endian::Little,
1981 options: HashSet::from([UInt(0), UInt(1), UInt(23)]),
1982 ranges: vec![],
1983 input_length: bx!(,5)
1984 });
1985 assert_eq!(res, Token::CharacterClass(std::rc::Rc::new(cls)));
1986
1987 let input = "<u6:^0,1,23".chars().collect();
1988 let res = parse_char_class(&input).unwrap();
1989 let cls = DynamicCharacterClass::UInt(CharClass::<UInt>{
1990 inverted: true,
1991 endian: Endian::Little,
1992 options: HashSet::from([UInt(0), UInt(1), UInt(23)]),
1993 ranges: vec![],
1994 input_length: bx!(,6)
1995 });
1996 assert_eq!(res, Token::CharacterClass(std::rc::Rc::new(cls)));
1997
1998 let input = "<u8:^0,50..64,1,23,100..128".chars().collect();
1999 let res = parse_char_class(&input).unwrap();
2000 let cls = DynamicCharacterClass::UInt(CharClass::<UInt>{
2001 inverted: true,
2002 endian: Endian::Little,
2003 options: HashSet::from([UInt(0), UInt(1), UInt(23)]),
2004 ranges: vec![(UInt(50),UInt(64)), (UInt(100), UInt(128))],
2005 input_length: bx!(,8)
2006 });
2007 assert_eq!(res, Token::CharacterClass(std::rc::Rc::new(cls)));
2008
2009 let input = "<u120:^0,51230..641234,1,23000,100..128".chars().collect();
2010 let res = parse_char_class(&input).unwrap();
2011 let cls = DynamicCharacterClass::UInt(CharClass::<UInt>{
2012 inverted: true,
2013 endian: Endian::Little,
2014 options: HashSet::from([UInt(0), UInt(1), UInt(23000)]),
2015 ranges: vec![(UInt(51230),UInt(641234)), (UInt(100), UInt(128))],
2016 input_length: bx!(,120)
2017 });
2018 assert_eq!(res, Token::CharacterClass(std::rc::Rc::new(cls)));
2019 }
2020
2021 #[test]
2022 fn twos_compliment_be_char_class() {
2023 let input = "i5:0,-1,+23".chars().collect();
2024 let res = parse_char_class(&input).unwrap();
2025 let cls = DynamicCharacterClass::TwosCompInt(CharClass::<TwosCompInt>{
2026 inverted: false,
2027 endian: Endian::Big,
2028 options: HashSet::from([TwosCompInt(0), TwosCompInt(-1), TwosCompInt(23)]),
2029 ranges: vec![],
2030 input_length: bx!(,5)
2031 });
2032 assert_eq!(res, Token::CharacterClass(std::rc::Rc::new(cls)));
2033
2034 let input = ">i6:^+0,1,-23".chars().collect();
2035 let res = parse_char_class(&input).unwrap();
2036 let cls = DynamicCharacterClass::TwosCompInt(CharClass::<TwosCompInt>{
2037 inverted: true,
2038 endian: Endian::Big,
2039 options: HashSet::from([TwosCompInt(0), TwosCompInt(1), TwosCompInt(-23)]),
2040 ranges: vec![],
2041 input_length: bx!(,6)
2042 });
2043 assert_eq!(res, Token::CharacterClass(std::rc::Rc::new(cls)));
2044
2045 let input = "i8:^-0,-50..+64,-1,+23,-130..-100".chars().collect();
2046 let res = parse_char_class(&input).unwrap();
2047 let cls = DynamicCharacterClass::TwosCompInt(CharClass::<TwosCompInt>{
2048 inverted: true,
2049 endian: Endian::Big,
2050 options: HashSet::from([TwosCompInt(0), TwosCompInt(-1), TwosCompInt(23)]),
2051 ranges: vec![(TwosCompInt(-50), TwosCompInt(64)), (TwosCompInt(-130), TwosCompInt(-100))],
2052 input_length: bx!(,8)
2053 });
2054 assert_eq!(res, Token::CharacterClass(std::rc::Rc::new(cls)));
2055
2056 let input = ">i120:^+0,-521230..-641234,1,-23000,+100..128".chars().collect();
2057 let res = parse_char_class(&input).unwrap();
2058 let cls = DynamicCharacterClass::TwosCompInt(CharClass::<TwosCompInt>{
2059 inverted: true,
2060 endian: Endian::Big,
2061 options: HashSet::from([TwosCompInt(0), TwosCompInt(1), TwosCompInt(-23000)]),
2062 ranges: vec![(TwosCompInt(-521230), TwosCompInt(-641234)), (TwosCompInt(100), TwosCompInt(128))],
2063 input_length: bx!(,120)
2064 });
2065 assert_eq!(res, Token::CharacterClass(std::rc::Rc::new(cls)));
2066 }
2067
2068 #[test]
2069 fn ascii_char_class() {
2070 let input = "abc".chars().collect();
2071 let res = parse_char_class(&input);
2072 let mut options = HashSet::new();
2073 options.insert(UInt(97));
2074 options.insert(UInt(98));
2075 options.insert(UInt(99));
2076 let cls = DynamicCharacterClass::UInt(CharClass::<UInt>{
2077 inverted: false,
2078 endian: Endian::Big,
2079 options,
2080 ranges: vec![],
2081 input_length: bx!(,8)
2082 });
2083 assert_eq!(res, Ok(Token::CharacterClass(std::rc::Rc::new(cls))));
2084
2085 let input = "a8:^abc".chars().collect();
2086 let res = parse_char_class(&input);
2087 let mut options = HashSet::new();
2088 options.insert(UInt(97));
2089 options.insert(UInt(98));
2090 options.insert(UInt(99));
2091 let cls = DynamicCharacterClass::UInt(CharClass::<UInt>{
2092 inverted: true,
2093 endian: Endian::Big,
2094 options,
2095 ranges: vec![],
2096 input_length: bx!(,8)
2097 });
2098 assert_eq!(res, Ok(Token::CharacterClass(std::rc::Rc::new(cls))));
2099
2100 let input = "\\\\a\\bc".chars().collect();
2101 let res = parse_char_class(&input);
2102 let mut options = HashSet::new();
2103 options.insert(UInt(92));
2104 options.insert(UInt(97));
2105 options.insert(UInt(98));
2106 options.insert(UInt(99));
2107 let cls = DynamicCharacterClass::UInt(CharClass::<UInt>{
2108 inverted: false,
2109 endian: Endian::Big,
2110 options,
2111 ranges: vec![],
2112 input_length: bx!(,8)
2113 });
2114 assert_eq!(res, Ok(Token::CharacterClass(std::rc::Rc::new(cls))));
2115
2116 let input = "a8:abc0-9".chars().collect();
2117 let res = parse_char_class(&input);
2118 let mut options = HashSet::new();
2119 options.insert(UInt(97));
2120 options.insert(UInt(98));
2121 options.insert(UInt(99));
2122 let cls = DynamicCharacterClass::UInt(CharClass::<UInt>{
2123 inverted: false,
2124 endian: Endian::Big,
2125 options,
2126 ranges: vec![(UInt(48), UInt(57))],
2127 input_length: bx!(,8)
2128 });
2129 assert_eq!(res, Ok(Token::CharacterClass(std::rc::Rc::new(cls))));
2130
2131 let input = "abc0-9\\--z".chars().collect();
2132 let res = parse_char_class(&input);
2133 let mut options = HashSet::new();
2134 options.insert(UInt(97));
2135 options.insert(UInt(98));
2136 options.insert(UInt(99));
2137 let cls = DynamicCharacterClass::UInt(CharClass::<UInt>{
2138 inverted: false,
2139 endian: Endian::Big,
2140 options,
2141 ranges: vec![(UInt(48), UInt(57)), (UInt(45), UInt(122))],
2142 input_length: bx!(,8)
2143 });
2144 assert_eq!(res, Ok(Token::CharacterClass(std::rc::Rc::new(cls))));
2145
2146 let input = "a8:^0-9abc0-9d\\--zef".chars().collect();
2147 let res = parse_char_class(&input);
2148 let mut options = HashSet::new();
2149 options.insert(UInt(97));
2150 options.insert(UInt(98));
2151 options.insert(UInt(99));
2152 options.insert(UInt(100));
2153 options.insert(UInt(101));
2154 options.insert(UInt(102));
2155 let cls = DynamicCharacterClass::UInt(CharClass::<UInt>{
2156 inverted: true,
2157 endian: Endian::Big,
2158 options,
2159 ranges: vec![(UInt(48), UInt(57)), (UInt(48), UInt(57)), (UInt(45), UInt(122))],
2160 input_length: bx!(,8)
2161 });
2162 assert_eq!(res, Ok(Token::CharacterClass(std::rc::Rc::new(cls))));
2163 }
2164
2165 #[test]
2166 fn group_headers() {
2167 let input = "___''..".chars().collect::<Vec<char>>();
2168 assert_eq!(parse_group_header(&input), Ok((GroupType::Numbered, 0)));
2169 let input = "?:___''..".chars().collect::<Vec<char>>();
2170 assert_eq!(parse_group_header(&input), Ok((GroupType::NonCapturing, 2)));
2171 let input = "?<hello>___''..".chars().collect::<Vec<char>>();
2172 assert_eq!(parse_group_header(&input), Ok((GroupType::Named("hello".to_string()), 8)));
2173 }
2174
2175 #[test]
2176 fn groups_basic() {
2177 assert_eq!(BinRegex::new("_'.").unwrap().n_groups, 1);
2178 assert_eq!(BinRegex::new("(_'){5}_{2}").unwrap().n_groups, 2);
2179 assert_eq!(BinRegex::new("(_'){2,}_{2,}").unwrap().n_groups, 2);
2180 assert_eq!(BinRegex::new("(_'){2,4}_{1,10}").unwrap().n_groups, 2);
2181 assert_eq!(BinRegex::new("(_'){4,8}(_){1,2}").unwrap().n_groups, 3);
2182 assert_eq!(BinRegex::new("(_')+_+").unwrap().n_groups, 2);
2183 assert_eq!(BinRegex::new("((('')|(_'))*|_)*f0").unwrap().n_groups, 5);
2184 }
2185}