1use std::fmt;
14
15const MAX_PATTERN_SOURCE_BYTES: usize = 16 * 1024;
16const MAX_PATTERN_ATOMS: usize = u16::MAX as usize;
17const MAX_GROUP_ALTERNATIVES: usize = 1024;
18
19#[derive(Copy, Clone, Debug, Eq, PartialEq)]
21pub struct ParsePatError {
22 kind: PatError,
23 position: usize,
24}
25
26impl ParsePatError {
27 pub fn kind(self) -> PatError {
29 self.kind
30 }
31
32 pub fn position(self) -> usize {
34 self.position
35 }
36}
37
38impl fmt::Display for ParsePatError {
39 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
40 write!(
41 f,
42 "Syntax Error @{}: {}.",
43 self.position,
44 self.kind.as_str()
45 )
46 }
47}
48
49impl std::error::Error for ParsePatError {}
50
51#[derive(Copy, Clone, Debug, Eq, PartialEq)]
53pub enum PatError {
54 UnpairedHexDigit,
55 UnknownChar,
56 UnclosedQuote,
57 SaveOverflow,
58 ReadOperand,
59 SkipOperand,
60 GroupOperand,
61 StackError,
62 StackInvalid,
63 PatternTooLong,
64 PatternTooComplex,
65}
66
67impl PatError {
68 fn as_str(self) -> &'static str {
69 match self {
70 PatError::UnpairedHexDigit => "unpaired hex digit",
71 PatError::UnknownChar => "unknown character",
72 PatError::UnclosedQuote => "string missing end quote",
73 PatError::SaveOverflow => "save store overflow",
74 PatError::ReadOperand => "read operand error",
75 PatError::SkipOperand => "skip operand error",
76 PatError::GroupOperand => "group operand error",
77 PatError::StackError => "stack unbalanced",
78 PatError::StackInvalid => "stack must follow jump",
79 PatError::PatternTooLong => "pattern input too long",
80 PatError::PatternTooComplex => "pattern expands beyond supported complexity limits",
81 }
82 }
83}
84
85#[derive(Copy, Clone, Debug, Eq, PartialEq)]
87pub enum Atom {
88 Byte(u8),
90 Fuzzy(u8),
92 Save(u8),
94 Skip(u8),
96 SkipRange(u16, u16),
98 Push(u8),
100 Pop,
102 Jump1,
104 Jump4,
106 Ptr,
108 Pir(u8),
110 ReadI8(u8),
112 ReadU8(u8),
114 ReadI16(u8),
116 ReadU16(u8),
118 ReadI32(u8),
120 ReadU32(u8),
122 Zero(u8),
124 Back(u8),
126 Aligned(u8),
128 Check(u8),
130 Case(u16),
132 Break(u16),
134 Nop,
136}
137
138pub type Pattern = Vec<Atom>;
140
141pub fn save_len(pat: &[Atom]) -> usize {
143 pat.iter()
144 .filter_map(|atom| match atom {
145 Atom::Save(slot)
146 | Atom::ReadI8(slot)
147 | Atom::ReadU8(slot)
148 | Atom::ReadI16(slot)
149 | Atom::ReadU16(slot)
150 | Atom::ReadI32(slot)
151 | Atom::ReadU32(slot)
152 | Atom::Zero(slot)
153 | Atom::Check(slot)
154 | Atom::Pir(slot) => Some(usize::from(*slot) + 1),
155 _ => None,
156 })
157 .max()
158 .unwrap_or(0)
159}
160
161pub fn parse(pat: &str) -> Result<Pattern, ParsePatError> {
350 if pat.len() > MAX_PATTERN_SOURCE_BYTES {
351 return Err(ParsePatError {
352 kind: PatError::PatternTooLong,
353 position: MAX_PATTERN_SOURCE_BYTES,
354 });
355 }
356
357 let mut parser = Parser::new(pat);
358 let mut result = Vec::with_capacity(pat.len() / 2);
359 result.push(Atom::Save(0));
360 result.append(&mut parser.parse_sequence(&[])?);
361
362 if result.len() > MAX_PATTERN_ATOMS {
363 return Err(ParsePatError {
364 kind: PatError::PatternTooComplex,
365 position: pat.len(),
366 });
367 }
368
369 while matches!(result.last(), Some(Atom::Skip(_))) {
370 result.pop();
371 }
372
373 if parser.depth != 0 {
374 return Err(ParsePatError {
375 kind: PatError::StackError,
376 position: pat.len(),
377 });
378 }
379 if parser.peek().is_some() {
380 return Err(ParsePatError {
381 kind: PatError::GroupOperand,
382 position: parser.peek().map(|(pos, _)| pos).unwrap_or(pat.len()),
383 });
384 }
385
386 Ok(result)
387}
388
389struct Parser<'a> {
390 chars: Vec<(usize, char)>,
391 cursor: usize,
392 save: u8,
393 depth: u16,
394 _source: &'a str,
395}
396
397impl<'a> Parser<'a> {
398 fn new(source: &'a str) -> Self {
399 Self {
400 chars: source.char_indices().collect(),
401 cursor: 0,
402 save: 1,
403 depth: 0,
404 _source: source,
405 }
406 }
407
408 fn parse_sequence(&mut self, stoppers: &[char]) -> Result<Vec<Atom>, ParsePatError> {
409 let mut result = Vec::new();
410
411 while let Some((position, ch)) = self.peek() {
412 if stoppers.contains(&ch) {
413 break;
414 }
415
416 self.bump();
417 match ch {
418 ' ' | '\n' | '\r' | '\t' => {}
419 '?' => {
420 push_skip(&mut result, 1);
421 }
422 '[' => self.parse_skip_operand(position, &mut result)?,
423 '\'' => {
424 if self.save == u8::MAX {
425 return Err(ParsePatError {
426 kind: PatError::SaveOverflow,
427 position,
428 });
429 }
430 result.push(Atom::Save(self.save));
431 self.save += 1;
432 }
433 '%' => result.push(Atom::Jump1),
434 '$' => result.push(Atom::Jump4),
435 '*' => result.push(Atom::Ptr),
436 '"' => {
437 let mut closed = false;
438 while let Some((_, next)) = self.bump() {
439 if next == '"' {
440 closed = true;
441 break;
442 }
443 if !next.is_ascii() {
444 return Err(ParsePatError {
445 kind: PatError::UnknownChar,
446 position,
447 });
448 }
449 result.push(Atom::Byte(next as u8));
450 }
451 if !closed {
452 return Err(ParsePatError {
453 kind: PatError::UnclosedQuote,
454 position,
455 });
456 }
457 }
458 '{' => {
459 if self.depth == u16::MAX {
460 return Err(ParsePatError {
461 kind: PatError::StackError,
462 position,
463 });
464 }
465 self.depth += 1;
466 let Some(last) = result.last_mut() else {
467 return Err(ParsePatError {
468 kind: PatError::StackInvalid,
469 position,
470 });
471 };
472 let replaced = match *last {
473 Atom::Jump1 => {
474 *last = Atom::Push(1);
475 Atom::Jump1
476 }
477 Atom::Jump4 => {
478 *last = Atom::Push(4);
479 Atom::Jump4
480 }
481 Atom::Ptr => {
482 *last = Atom::Push(0);
483 Atom::Ptr
484 }
485 _ => {
486 return Err(ParsePatError {
487 kind: PatError::StackInvalid,
488 position,
489 });
490 }
491 };
492 result.push(replaced);
493 }
494 '}' => {
495 if self.depth == 0 {
496 return Err(ParsePatError {
497 kind: PatError::StackError,
498 position,
499 });
500 }
501 self.depth -= 1;
502 result.push(Atom::Pop);
503 }
504 'i' | 'u' => {
505 let signed = ch == 'i';
506 let (_, op) = self.bump().ok_or(ParsePatError {
507 kind: PatError::ReadOperand,
508 position,
509 })?;
510 if self.save == u8::MAX {
511 return Err(ParsePatError {
512 kind: PatError::SaveOverflow,
513 position,
514 });
515 }
516 let slot = self.save;
517 self.save += 1;
518 let atom = match (signed, op) {
519 (true, '1') => Atom::ReadI8(slot),
520 (false, '1') => Atom::ReadU8(slot),
521 (true, '2') => Atom::ReadI16(slot),
522 (false, '2') => Atom::ReadU16(slot),
523 (true, '4') => Atom::ReadI32(slot),
524 (false, '4') => Atom::ReadU32(slot),
525 _ => {
526 return Err(ParsePatError {
527 kind: PatError::ReadOperand,
528 position,
529 });
530 }
531 };
532 result.push(atom);
533 }
534 'z' => {
535 if self.save == u8::MAX {
536 return Err(ParsePatError {
537 kind: PatError::SaveOverflow,
538 position,
539 });
540 }
541 result.push(Atom::Zero(self.save));
542 self.save += 1;
543 }
544 '@' => {
545 let (next_pos, op) = self.bump().ok_or(ParsePatError {
546 kind: PatError::ReadOperand,
547 position,
548 })?;
549 let align = match op {
550 '0'..='9' => op as u8 - b'0',
551 'A'..='Z' => 10 + (op as u8 - b'A'),
552 'a'..='z' => 10 + (op as u8 - b'a'),
553 _ => {
554 return Err(ParsePatError {
555 kind: PatError::ReadOperand,
556 position: next_pos,
557 });
558 }
559 };
560 result.push(Atom::Aligned(align));
561 }
562 '(' => {
563 let alts = self.parse_group(position)?;
564 result.append(&mut compile_alternatives(alts, position)?);
565 }
566 _ if ch.is_ascii_hexdigit() => {
567 let (next_position, lo_ch) = self.bump().ok_or(ParsePatError {
568 kind: PatError::UnpairedHexDigit,
569 position,
570 })?;
571 if !lo_ch.is_ascii_hexdigit() {
572 return Err(ParsePatError {
573 kind: PatError::UnpairedHexDigit,
574 position: next_position,
575 });
576 }
577 let hi = hex_value(ch).expect("ascii hex already validated");
578 let lo = hex_value(lo_ch).expect("ascii hex already validated");
579 result.push(Atom::Byte((hi << 4) | lo));
580 }
581 _ => {
582 return Err(ParsePatError {
583 kind: PatError::UnknownChar,
584 position,
585 });
586 }
587 }
588
589 if result.len() > MAX_PATTERN_ATOMS {
590 return Err(ParsePatError {
591 kind: PatError::PatternTooComplex,
592 position,
593 });
594 }
595 }
596
597 Ok(result)
598 }
599
600 fn parse_group(&mut self, position: usize) -> Result<Vec<Vec<Atom>>, ParsePatError> {
601 let mut alternatives = Vec::new();
602 let group_save = self.save;
603 let group_depth = self.depth;
604 let mut max_save = self.save;
605 loop {
606 self.save = group_save;
607 self.depth = group_depth;
608 let seq = self.parse_sequence(&['|', ')'])?;
609 max_save = max_save.max(self.save);
610 alternatives.push(seq);
611 if alternatives.len() > MAX_GROUP_ALTERNATIVES {
612 return Err(ParsePatError {
613 kind: PatError::PatternTooComplex,
614 position,
615 });
616 }
617
618 let Some((stop_pos, stop)) = self.bump() else {
619 return Err(ParsePatError {
620 kind: PatError::GroupOperand,
621 position,
622 });
623 };
624
625 match stop {
626 '|' => continue,
627 ')' => break,
628 _ => {
629 return Err(ParsePatError {
630 kind: PatError::GroupOperand,
631 position: stop_pos,
632 });
633 }
634 }
635 }
636
637 if alternatives.is_empty() {
638 return Err(ParsePatError {
639 kind: PatError::GroupOperand,
640 position,
641 });
642 }
643
644 self.save = max_save;
645 self.depth = group_depth;
646
647 Ok(alternatives)
648 }
649
650 fn parse_skip_operand(
651 &mut self,
652 position: usize,
653 result: &mut Vec<Atom>,
654 ) -> Result<(), ParsePatError> {
655 let mut first: u32 = 0;
656 let mut second: u32 = 0;
657 let mut saw_first = false;
658 let mut saw_second = false;
659 let mut ranged = false;
660
661 while let Some((_, ch)) = self.bump() {
662 match ch {
663 '0'..='9' => {
664 let digit = u32::from(ch as u8 - b'0');
665 if !ranged {
666 saw_first = true;
667 first = first
668 .checked_mul(10)
669 .and_then(|n| n.checked_add(digit))
670 .ok_or(ParsePatError {
671 kind: PatError::SkipOperand,
672 position,
673 })?;
674 } else {
675 saw_second = true;
676 second = second
677 .checked_mul(10)
678 .and_then(|n| n.checked_add(digit))
679 .ok_or(ParsePatError {
680 kind: PatError::SkipOperand,
681 position,
682 })?;
683 }
684 }
685 '-' => {
686 if ranged || !saw_first {
687 return Err(ParsePatError {
688 kind: PatError::SkipOperand,
689 position,
690 });
691 }
692 ranged = true;
693 }
694 ']' => {
695 if !saw_first {
696 return Err(ParsePatError {
697 kind: PatError::SkipOperand,
698 position,
699 });
700 }
701 if !ranged {
702 push_skip(result, first);
703 return Ok(());
704 }
705 if !saw_second || second <= first {
706 return Err(ParsePatError {
707 kind: PatError::SkipOperand,
708 position,
709 });
710 }
711 let min = u16::try_from(first).map_err(|_| ParsePatError {
712 kind: PatError::SkipOperand,
713 position,
714 })?;
715 let max = u16::try_from(second - 1).map_err(|_| ParsePatError {
716 kind: PatError::SkipOperand,
717 position,
718 })?;
719 result.push(Atom::SkipRange(min, max));
720 return Ok(());
721 }
722 _ => {
723 return Err(ParsePatError {
724 kind: PatError::SkipOperand,
725 position,
726 });
727 }
728 }
729 }
730
731 Err(ParsePatError {
732 kind: PatError::SkipOperand,
733 position,
734 })
735 }
736
737 fn peek(&self) -> Option<(usize, char)> {
738 self.chars.get(self.cursor).copied()
739 }
740
741 fn bump(&mut self) -> Option<(usize, char)> {
742 let value = self.peek()?;
743 self.cursor += 1;
744 Some(value)
745 }
746}
747
748fn compile_alternatives(
749 mut alts: Vec<Vec<Atom>>,
750 position: usize,
751) -> Result<Vec<Atom>, ParsePatError> {
752 debug_assert!(!alts.is_empty(), "alternatives parser guarantees non-empty");
753 if alts.len() == 1 {
754 let only = alts
755 .pop()
756 .expect("alternatives parser guarantees exactly one arm remains");
757 return Ok(only);
758 }
759
760 let first = alts.remove(0);
761 let rest = compile_alternatives(alts, position)?;
762 let case_skip = u16::try_from(first.len() + 2).map_err(|_| ParsePatError {
763 kind: PatError::PatternTooComplex,
764 position,
765 })?;
766 let break_skip = u16::try_from(rest.len()).map_err(|_| ParsePatError {
767 kind: PatError::PatternTooComplex,
768 position,
769 })?;
770
771 let mut out = Vec::with_capacity(2 + first.len() + rest.len());
772 out.push(Atom::Case(case_skip));
773 out.extend(first);
774 out.push(Atom::Break(break_skip));
775 out.extend(rest);
776 if out.len() > MAX_PATTERN_ATOMS {
777 return Err(ParsePatError {
778 kind: PatError::PatternTooComplex,
779 position,
780 });
781 }
782 Ok(out)
783}
784
785fn hex_value(ch: char) -> Option<u8> {
786 match ch {
787 '0'..='9' => Some(ch as u8 - b'0'),
788 'a'..='f' => Some(ch as u8 - b'a' + 10),
789 'A'..='F' => Some(ch as u8 - b'A' + 10),
790 _ => None,
791 }
792}
793
794fn push_skip(result: &mut Vec<Atom>, mut remaining: u32) {
795 while remaining != 0 {
796 let chunk = u8::try_from(remaining.min(u32::from(u8::MAX))).expect("chunk is bounded");
797 remaining -= u32::from(chunk);
798
799 if let Some(Atom::Skip(prev)) = result.last_mut() {
800 let free = u8::MAX - *prev;
801 if free != 0 {
802 let to_add = free.min(chunk);
803 *prev += to_add;
804 let leftover = chunk - to_add;
805 if leftover != 0 {
806 result.push(Atom::Skip(leftover));
807 }
808 continue;
809 }
810 }
811
812 result.push(Atom::Skip(chunk));
813 }
814}
815
816#[cfg(test)]
817mod tests {
818 use proptest::prelude::*;
819
820 use super::{Atom, ParsePatError, PatError, parse};
821
822 #[test]
823 fn parses_used_subset() {
824 assert_eq!(
825 parse("488915${'} 488942"),
826 Ok(vec![
827 Atom::Save(0),
828 Atom::Byte(0x48),
829 Atom::Byte(0x89),
830 Atom::Byte(0x15),
831 Atom::Push(4),
832 Atom::Jump4,
833 Atom::Save(1),
834 Atom::Pop,
835 Atom::Byte(0x48),
836 Atom::Byte(0x89),
837 Atom::Byte(0x42),
838 ])
839 );
840
841 assert_eq!(
842 parse("68*'31c0c3"),
843 Ok(vec![
844 Atom::Save(0),
845 Atom::Byte(0x68),
846 Atom::Ptr,
847 Atom::Save(1),
848 Atom::Byte(0x31),
849 Atom::Byte(0xc0),
850 Atom::Byte(0xc3),
851 ])
852 );
853
854 assert_eq!(
855 parse("*{'90}"),
856 Ok(vec![
857 Atom::Save(0),
858 Atom::Push(0),
859 Atom::Ptr,
860 Atom::Save(1),
861 Atom::Byte(0x90),
862 Atom::Pop,
863 ])
864 );
865
866 assert_eq!(
867 parse("44 8B 81 u4 48 8D 0D"),
868 Ok(vec![
869 Atom::Save(0),
870 Atom::Byte(0x44),
871 Atom::Byte(0x8B),
872 Atom::Byte(0x81),
873 Atom::ReadU32(1),
874 Atom::Byte(0x48),
875 Atom::Byte(0x8D),
876 Atom::Byte(0x0D),
877 ])
878 );
879
880 assert_eq!(
881 parse("488b1d${'} 48891d[4] 4c63b3"),
882 Ok(vec![
883 Atom::Save(0),
884 Atom::Byte(0x48),
885 Atom::Byte(0x8b),
886 Atom::Byte(0x1d),
887 Atom::Push(4),
888 Atom::Jump4,
889 Atom::Save(1),
890 Atom::Pop,
891 Atom::Byte(0x48),
892 Atom::Byte(0x89),
893 Atom::Byte(0x1d),
894 Atom::Skip(4),
895 Atom::Byte(0x4c),
896 Atom::Byte(0x63),
897 Atom::Byte(0xb3),
898 ])
899 );
900
901 assert_eq!(
902 parse("\"hello\" 00"),
903 Ok(vec![
904 Atom::Save(0),
905 Atom::Byte(b'h'),
906 Atom::Byte(b'e'),
907 Atom::Byte(b'l'),
908 Atom::Byte(b'l'),
909 Atom::Byte(b'o'),
910 Atom::Byte(0x00),
911 ])
912 );
913 }
914
915 #[test]
916 fn reports_error_position() {
917 assert_eq!(
918 parse("4G") as Result<Vec<Atom>, ParsePatError>,
919 Err(ParsePatError {
920 kind: PatError::UnpairedHexDigit,
921 position: 1,
922 })
923 );
924
925 assert_eq!(
926 parse("u8") as Result<Vec<Atom>, ParsePatError>,
927 Err(ParsePatError {
928 kind: PatError::ReadOperand,
929 position: 0,
930 })
931 );
932
933 assert_eq!(
934 parse("\"unterminated") as Result<Vec<Atom>, ParsePatError>,
935 Err(ParsePatError {
936 kind: PatError::UnclosedQuote,
937 position: 0,
938 })
939 );
940 }
941
942 #[test]
943 fn group_alternatives_reset_and_merge_save_state() {
944 assert_eq!(
945 parse("('41|'42)'"),
946 Ok(vec![
947 Atom::Save(0),
948 Atom::Case(4),
949 Atom::Save(1),
950 Atom::Byte(0x41),
951 Atom::Break(2),
952 Atom::Save(1),
953 Atom::Byte(0x42),
954 Atom::Save(2),
955 ])
956 );
957 }
958
959 #[test]
960 fn range_skip_uses_strict_upper_bound() {
961 assert_eq!(
962 parse("[5-6]"),
963 Ok(vec![Atom::Save(0), Atom::SkipRange(5, 5)])
964 );
965
966 assert_eq!(
967 parse("[5-5]") as Result<Vec<Atom>, ParsePatError>,
968 Err(ParsePatError {
969 kind: PatError::SkipOperand,
970 position: 0,
971 })
972 );
973 }
974
975 #[test]
976 fn wildcard_semantics_match_pelite() {
977 assert_eq!(
978 parse("A?") as Result<Vec<Atom>, ParsePatError>,
979 Err(ParsePatError {
980 kind: PatError::UnpairedHexDigit,
981 position: 1,
982 })
983 );
984
985 assert_eq!(parse("?"), Ok(vec![Atom::Save(0)]));
986 assert_eq!(parse("??"), Ok(vec![Atom::Save(0)]));
987
988 assert_eq!(
989 parse("4183?03"),
990 Ok(vec![
991 Atom::Save(0),
992 Atom::Byte(0x41),
993 Atom::Byte(0x83),
994 Atom::Skip(1),
995 Atom::Byte(0x03),
996 ])
997 );
998 }
999
1000 #[test]
1001 fn supports_aligned_base36_syntax() {
1002 assert_eq!(parse("@4"), Ok(vec![Atom::Save(0), Atom::Aligned(4)]));
1003 assert_eq!(parse("@A"), Ok(vec![Atom::Save(0), Atom::Aligned(10)]));
1004 assert_eq!(parse("@f"), Ok(vec![Atom::Save(0), Atom::Aligned(15)]));
1005 assert_eq!(parse("@z"), Ok(vec![Atom::Save(0), Atom::Aligned(35)]));
1006
1007 assert_eq!(
1008 parse("@") as Result<Vec<Atom>, ParsePatError>,
1009 Err(ParsePatError {
1010 kind: PatError::ReadOperand,
1011 position: 0,
1012 })
1013 );
1014 assert_eq!(
1015 parse("@_") as Result<Vec<Atom>, ParsePatError>,
1016 Err(ParsePatError {
1017 kind: PatError::ReadOperand,
1018 position: 1,
1019 })
1020 );
1021 assert_eq!(
1022 parse("@?") as Result<Vec<Atom>, ParsePatError>,
1023 Err(ParsePatError {
1024 kind: PatError::ReadOperand,
1025 position: 1,
1026 })
1027 );
1028 }
1029
1030 #[test]
1031 fn save_len_counts_programmatic_pir_slots() {
1032 let pat = [Atom::Save(0), Atom::Pir(3)];
1033 assert_eq!(super::save_len(&pat), 4);
1034 }
1035
1036 proptest! {
1037 #[test]
1038 fn parsed_patterns_preserve_base_capture_and_slot_bounds(source in "[ -~]{0,128}") {
1039 if let Ok(atoms) = parse(&source) {
1040 prop_assert!(!atoms.is_empty());
1041 prop_assert_eq!(atoms[0], Atom::Save(0));
1042
1043 let required_slots = super::save_len(&atoms);
1044 prop_assert!(required_slots >= 1);
1045 prop_assert!(required_slots <= usize::from(u8::MAX));
1046 }
1047 }
1048 }
1049}