1use alloc::vec::Vec;
2use core::num::ParseIntError;
3
4use super::{
5 Lexer,
6 PositionedError,
7 Token,
8};
9use crate::{
10 pattern::OwnedBinaryPattern,
11 Atom,
12 JumpType,
13 ReadWidth,
14};
15
16#[derive(Debug, PartialEq, Eq, Clone)]
17pub enum ParseError {
18 UnexpectedToken,
19 UnexpectedEnd,
20
21 MaskByteLenMismatch,
22
23 HexValueInvalid,
24 HexValueIncomplete,
25
26 GroupNotClosed,
27 BlockNotClosed,
28
29 RangeBoundInvalid(ParseIntError),
30 RangeEndMustBeGraterThenStart,
31
32 SequenceTooLarge,
33}
34
35pub struct PatternParser<'a> {
36 lexer: Lexer<'a>,
37 peeked_token: Option<Token<'a>>,
38
39 byte_sequence: Vec<u8>,
40 atoms: Vec<Atom>,
41}
42
43impl<'a> PatternParser<'a> {
44 pub fn new(input: &'a str) -> Self {
45 Self {
46 lexer: Lexer::new(input),
47 peeked_token: None,
48
49 atoms: Vec::with_capacity(128),
50 byte_sequence: Vec::with_capacity(256),
51 }
52 }
53
54 fn peek_token(&mut self) -> Option<&Token<'a>> {
55 if self.peeked_token.is_none() {
56 self.peeked_token = self.lexer.next_token();
57 }
58
59 self.peeked_token.as_ref()
60 }
61
62 fn pop_token(&mut self) -> Result<Token<'a>, PositionedError<ParseError>> {
63 if let Some(token) = self.peeked_token.take() {
64 Ok(token)
65 } else if let Some(token) = self.lexer.next_token() {
66 Ok(token)
67 } else {
68 Err(PositionedError::new(
69 self.lexer.token_range(),
70 ParseError::UnexpectedEnd,
71 ))
72 }
73 }
74
75 pub fn parse(mut self) -> Result<OwnedBinaryPattern, PositionedError<ParseError>> {
76 let _ = self.parse_until(|_| false)?;
78 Ok(OwnedBinaryPattern::new(self.atoms, self.byte_sequence))
79 }
80
81 fn parse_until(
82 &mut self,
83 matcher: impl Fn(&Token<'a>) -> bool,
84 ) -> Result<bool, PositionedError<ParseError>> {
85 while let Some(token) = self.peek_token() {
86 if matcher(token) {
87 return Ok(true);
88 }
89
90 match token {
91 Token::Text(_) => self.parse_byte_sequence()?,
92 Token::Whildcard => self.parse_wildcard()?,
93
94 Token::PositionSave => self.parse_position_save()?,
95
96 Token::JumpRel1 => self.parse_jump()?,
97 Token::JumpRel4 => self.parse_jump()?,
98 Token::JumpAbs64 => self.parse_jump()?,
99
100 Token::Read1 => self.parse_read()?,
101 Token::Read2 => self.parse_read()?,
102 Token::Read4 => self.parse_read()?,
103
104 Token::GroupOpen => self.parse_group()?,
105 Token::RangeOpen => self.parse_range()?,
106
107 _ => {
108 return Err(PositionedError::new(
109 self.lexer.token_range(),
110 ParseError::UnexpectedToken,
111 ))
112 }
113 }
114 }
115
116 Ok(false)
117 }
118
119 fn parse_bytes(&mut self) -> Result<(u16, u16), PositionedError<ParseError>> {
120 let Token::Text(value) = self.pop_token()? else {
121 return Err(PositionedError::new(
122 self.lexer.token_range(),
123 ParseError::UnexpectedToken,
124 ));
125 };
126
127 let token_range = self.lexer.token_range();
128 let bytes_start = self.byte_sequence.len();
129 let mut values = value.char_indices();
130 while let Some((upper_index, upper)) = values.next() {
131 let Some((lower_index, lower)) = values.next() else {
132 return Err(PositionedError::new(
134 token_range.start + upper_index..token_range.start + upper_index + 1,
135 ParseError::HexValueIncomplete,
136 ));
137 };
138
139 let Some(upper) = upper.to_digit(16) else {
140 return Err(if upper_index == 0 {
141 PositionedError::new(token_range, ParseError::UnexpectedToken)
143 } else {
144 PositionedError::new(
145 token_range.start + upper_index..token_range.start + lower_index + 1,
146 ParseError::HexValueInvalid,
147 )
148 });
149 };
150
151 let Some(lower) = lower.to_digit(16) else {
152 return Err(PositionedError::new(
153 token_range.start + upper_index..token_range.start + lower_index + 1,
154 ParseError::HexValueInvalid,
155 ));
156 };
157
158 self.byte_sequence.push((upper as u8) << 4 | (lower as u8));
159 }
160
161 let bytes_end = self.byte_sequence.len();
162 if bytes_start > u16::MAX as usize || bytes_end > u16::MAX as usize {
163 return Err(PositionedError::new(
164 token_range,
165 ParseError::SequenceTooLarge,
166 ));
167 }
168
169 Ok((bytes_start as u16, bytes_end as u16))
170 }
171
172 fn parse_byte_sequence(&mut self) -> Result<(), PositionedError<ParseError>> {
173 let (bytes_start, bytes_end) = self.parse_bytes()?;
174
175 if let Some(Token::Mask) = self.peek_token() {
176 let _ = self.pop_token()?;
178
179 let (mask_start, mask_end) = self.parse_bytes()?;
180 let mask_length = mask_end - mask_start;
181 let bytes_length = bytes_end - bytes_start;
182 if mask_length != bytes_length {
183 return Err(PositionedError::new(
184 self.lexer.token_range(),
185 ParseError::MaskByteLenMismatch,
186 ));
187 }
188
189 self.atoms.push(Atom::ByteSequenceMasked {
190 seq_start: bytes_start,
191 mask_start,
192 len: bytes_length,
193 });
194 } else {
195 self.atoms.push(Atom::ByteSequence {
197 seq_start: bytes_start,
198 seq_end: bytes_end,
199 });
200 }
201
202 Ok(())
203 }
204
205 fn parse_position_save(&mut self) -> Result<(), PositionedError<ParseError>> {
206 let Token::PositionSave = self.pop_token()? else {
207 return Err(PositionedError::new(
208 self.lexer.token_range(),
209 ParseError::UnexpectedToken,
210 ));
211 };
212
213 self.atoms.push(Atom::SaveCursor);
214 Ok(())
215 }
216
217 fn parse_wildcard(&mut self) -> Result<(), PositionedError<ParseError>> {
218 let Token::Whildcard = self.pop_token()? else {
219 return Err(PositionedError::new(
220 self.lexer.token_range(),
221 ParseError::UnexpectedToken,
222 ));
223 };
224
225 self.atoms.push(Atom::WildcardFixed(1));
226 Ok(())
227 }
228
229 fn parse_read(&mut self) -> Result<(), PositionedError<ParseError>> {
230 let read_width = match self.pop_token()? {
231 Token::Read1 => ReadWidth::Byte,
232 Token::Read2 => ReadWidth::Word,
233 Token::Read4 => ReadWidth::DWord,
234 _ => {
235 return Err(PositionedError::new(
236 self.lexer.token_range(),
237 ParseError::UnexpectedToken,
238 ))
239 }
240 };
241
242 self.atoms.push(Atom::Read(read_width));
243 Ok(())
244 }
245
246 fn parse_jump(&mut self) -> Result<(), PositionedError<ParseError>> {
247 let (jump_type, width) = match self.pop_token()? {
248 Token::JumpRel1 => (JumpType::RelByte, 1),
249 Token::JumpRel4 => (JumpType::RelDWord, 4),
250 Token::JumpAbs64 => (JumpType::AbsQWord, 8),
251 _ => {
252 return Err(PositionedError::new(
253 self.lexer.token_range(),
254 ParseError::UnexpectedToken,
255 ))
256 }
257 };
258
259 if matches!(self.peek_token(), Some(Token::BlockOpen)) {
260 let _ = self.pop_token()?;
261 self.atoms.push(Atom::CursorPush);
262 self.atoms.push(Atom::Jump(jump_type));
263
264 let block_start = self.lexer.token_range();
265 if !self.parse_until(|token| matches!(token, Token::BlockClose))? {
266 return Err(PositionedError::new(
267 block_start,
268 ParseError::BlockNotClosed,
269 ));
270 }
271
272 self.atoms.push(Atom::CursorPop { advance: width });
273 let _ = self.pop_token()?;
274 } else {
275 self.atoms.push(Atom::Jump(jump_type));
276 }
277
278 Ok(())
279 }
280
281 fn parse_group(&mut self) -> Result<(), PositionedError<ParseError>> {
282 let Token::GroupOpen = self.pop_token()? else {
283 return Err(PositionedError::new(
284 self.lexer.token_range(),
285 ParseError::UnexpectedToken,
286 ));
287 };
288
289 let group_start = self.lexer.token_range();
290 let mut branch_atoms = Vec::with_capacity(8);
291 loop {
292 let branch_atom_index = self.atoms.len();
293 self.atoms.push(Atom::Branch {
294 left_len: 0,
295 right_len: 0,
296 });
297
298 if !self.parse_until(|token| matches!(token, Token::GroupClose | Token::GroupPipe))? {
299 return Err(PositionedError::new(
300 group_start,
301 ParseError::GroupNotClosed,
302 ));
303 }
304
305 if matches!(self.pop_token()?, Token::GroupClose) {
306 self.atoms.remove(branch_atom_index);
308 break;
309 }
310
311 let left_branch_len = self.atoms.len() - branch_atom_index - 1;
312 if left_branch_len > u16::MAX as usize {
313 return Err(PositionedError::new(
314 self.lexer.token_range(),
315 ParseError::SequenceTooLarge,
316 ));
317 }
318
319 if let Atom::Branch { left_len, .. } = &mut self.atoms[branch_atom_index] {
320 *left_len = left_branch_len as u16;
321 } else {
322 unreachable!("atom should be a branch");
323 }
324
325 branch_atoms.push(branch_atom_index);
326 }
327
328 let atom_count = self.atoms.len();
329 for branch_atom_index in branch_atoms {
330 if let Atom::Branch {
331 left_len,
332 right_len,
333 } = &mut self.atoms[branch_atom_index]
334 {
335 let right_branch_len = atom_count - *left_len as usize - branch_atom_index - 1;
336 if right_branch_len > u16::MAX as usize {
337 return Err(PositionedError::new(
338 self.lexer.token_range(),
339 ParseError::SequenceTooLarge,
340 ));
341 }
342
343 *right_len = right_branch_len as u16;
344 } else {
345 unreachable!("atom should be a branch");
346 }
347 }
348
349 Ok(())
350 }
351
352 fn parse_range(&mut self) -> Result<(), PositionedError<ParseError>> {
353 let Token::RangeOpen = self.pop_token()? else {
354 return Err(PositionedError::new(
355 self.lexer.token_range(),
356 ParseError::UnexpectedToken,
357 ));
358 };
359
360 let Token::Text(range_start) = self.pop_token()? else {
361 return Err(PositionedError::new(
362 self.lexer.token_range(),
363 ParseError::UnexpectedToken,
364 ));
365 };
366
367 let range_start = range_start.parse::<u16>().map_err(|err| {
368 PositionedError::new(self.lexer.token_range(), ParseError::RangeBoundInvalid(err))
369 })?;
370
371 match self.pop_token()? {
372 Token::RangeClose => {
373 self.atoms.push(Atom::WildcardFixed(range_start));
374 Ok(())
375 }
376 Token::RangeSeperator => {
377 let Token::Text(range_end) = self.pop_token()? else {
378 return Err(PositionedError::new(
379 self.lexer.token_range(),
380 ParseError::UnexpectedToken,
381 ));
382 };
383
384 let range_end = range_end.parse::<u16>().map_err(|err| {
385 PositionedError::new(
386 self.lexer.token_range(),
387 ParseError::RangeBoundInvalid(err),
388 )
389 })?;
390
391 if range_end <= range_start {
392 return Err(PositionedError::new(
393 self.lexer.token_range(),
394 ParseError::RangeEndMustBeGraterThenStart,
395 ));
396 }
397
398 self.atoms.push(Atom::WildcardRange {
399 min: range_start,
400 max: range_end,
401 });
402 if !matches!(self.pop_token()?, Token::RangeClose) {
403 Err(PositionedError::new(
404 self.lexer.token_range(),
405 ParseError::UnexpectedToken,
406 ))
407 } else {
408 Ok(())
409 }
410 }
411 _ => Err(PositionedError::new(
412 self.lexer.token_range(),
413 ParseError::UnexpectedToken,
414 )),
415 }
416 }
417}
418
419pub fn parse_pattern(pattern: &str) -> Result<OwnedBinaryPattern, PositionedError<ParseError>> {
421 let parser = PatternParser::new(pattern);
422 parser.parse()
423}
424
425#[cfg(test)]
426mod test {
427 use super::PatternParser;
428 use crate::{
429 compiler::{
430 parser::ParseError,
431 PositionedError,
432 },
433 pattern::BinaryPattern,
434 Atom,
435 JumpType,
436 };
437
438 #[test]
439 fn test_byte_sequence() {
440 {
441 let parser = PatternParser::new("FF 00 12");
442 let result = parser.parse().unwrap();
443 assert_eq!(
444 result.atoms(),
445 &[
446 Atom::ByteSequence {
447 seq_start: 0,
448 seq_end: 1
449 },
450 Atom::ByteSequence {
451 seq_start: 1,
452 seq_end: 2
453 },
454 Atom::ByteSequence {
455 seq_start: 2,
456 seq_end: 3
457 }
458 ]
459 );
460 assert_eq!(result.byte_sequence(), &[0xFF, 0x00, 0x12]);
461 }
462
463 {
464 let parser = PatternParser::new("FF00 12");
465 let result = parser.parse().unwrap();
466 assert_eq!(
467 result.atoms(),
468 &[
469 Atom::ByteSequence {
470 seq_start: 0,
471 seq_end: 2
472 },
473 Atom::ByteSequence {
474 seq_start: 2,
475 seq_end: 3
476 }
477 ]
478 );
479 assert_eq!(result.byte_sequence(), &[0xFF, 0x00, 0x12]);
480 }
481
482 {
483 let parser = PatternParser::new("FF0");
484 let result = parser.parse().unwrap_err();
485 assert_eq!(
486 &result,
487 &PositionedError::new(2..3, ParseError::HexValueIncomplete)
488 );
489 }
490
491 {
492 let parser = PatternParser::new("FX");
493 let result = parser.parse().unwrap_err();
494 assert_eq!(
495 &result,
496 &PositionedError::new(0..2, ParseError::HexValueInvalid)
497 );
498 }
499 }
500
501 #[test]
502 fn test_byte_sequence_mask() {
503 {
504 let parser = PatternParser::new("FF & F0");
505 let result = parser.parse().unwrap();
506 assert_eq!(
507 result.atoms(),
508 &[Atom::ByteSequenceMasked {
509 seq_start: 0,
510 mask_start: 1,
511 len: 1
512 }]
513 );
514 assert_eq!(result.byte_sequence(), &[0xFF, 0xF0]);
515 }
516
517 {
518 let parser = PatternParser::new("FFEE & F0F0");
519 let result = parser.parse().unwrap();
520 assert_eq!(
521 result.atoms(),
522 &[Atom::ByteSequenceMasked {
523 seq_start: 0,
524 mask_start: 2,
525 len: 2
526 }]
527 );
528 assert_eq!(result.byte_sequence(), &[0xFF, 0xEE, 0xF0, 0xF0]);
529 }
530
531 {
532 let parser = PatternParser::new("FFFF & FF");
533 let result = parser.parse().unwrap_err();
534 assert_eq!(
535 &result,
536 &PositionedError::new(7..9, ParseError::MaskByteLenMismatch)
537 );
538 }
539 }
540
541 #[test]
542 fn test_jump() {
543 {
544 let parser = PatternParser::new("%$* FF * % $");
545 let result = parser.parse().unwrap();
546 assert_eq!(
547 result.atoms(),
548 &[
549 Atom::Jump(JumpType::RelByte),
550 Atom::Jump(JumpType::RelDWord),
551 Atom::Jump(JumpType::AbsQWord),
552 Atom::ByteSequence {
553 seq_start: 0,
554 seq_end: 1
555 },
556 Atom::Jump(JumpType::AbsQWord),
557 Atom::Jump(JumpType::RelByte),
558 Atom::Jump(JumpType::RelDWord),
559 ]
560 );
561 assert_eq!(result.byte_sequence(), &[0xFF]);
562 }
563 }
564
565 #[test]
566 fn test_jump_block() {
567 {
568 let parser = PatternParser::new("$ { FE }");
569 let result = parser.parse().unwrap();
570 assert_eq!(
571 result.atoms(),
572 &[
573 Atom::CursorPush,
574 Atom::Jump(JumpType::RelDWord),
575 Atom::ByteSequence {
576 seq_start: 0,
577 seq_end: 1
578 },
579 Atom::CursorPop { advance: 4 },
580 ]
581 );
582 assert_eq!(result.byte_sequence(), &[0xFE]);
583 }
584
585 {
586 let parser = PatternParser::new("$ { FE $ { FF } }");
587 let result = parser.parse().unwrap();
588 assert_eq!(
589 result.atoms(),
590 &[
591 Atom::CursorPush,
592 Atom::Jump(JumpType::RelDWord),
593 Atom::ByteSequence {
594 seq_start: 0,
595 seq_end: 1
596 },
597 Atom::CursorPush,
598 Atom::Jump(JumpType::RelDWord),
599 Atom::ByteSequence {
600 seq_start: 1,
601 seq_end: 2
602 },
603 Atom::CursorPop { advance: 4 },
604 Atom::CursorPop { advance: 4 },
605 ]
606 );
607 assert_eq!(result.byte_sequence(), &[0xFE, 0xFF]);
608 }
609
610 {
611 let parser = PatternParser::new("$ { FE");
612 let result = parser.parse().unwrap_err();
613 assert_eq!(
614 &result,
615 &PositionedError::new(2..3, ParseError::BlockNotClosed)
616 );
617 }
618 }
619
620 #[test]
621 fn test_group() {
622 {
623 let parser = PatternParser::new("()");
625 let result = parser.parse().unwrap();
626 assert_eq!(result.atoms(), &[]);
627 assert_eq!(result.byte_sequence(), &[]);
628 }
629
630 {
631 let parser = PatternParser::new("( FF00 )");
633 let result = parser.parse().unwrap();
634 assert_eq!(
635 result.atoms(),
636 &[Atom::ByteSequence {
637 seq_start: 0,
638 seq_end: 2
639 }]
640 );
641 assert_eq!(result.byte_sequence(), &[0xFF, 0x00]);
642 }
643
644 {
645 let parser = PatternParser::new("( 01 | 02 03 )");
647 let result = parser.parse().unwrap();
648 assert_eq!(
649 result.atoms(),
650 &[
651 Atom::Branch {
652 left_len: 1,
653 right_len: 2
654 },
655 Atom::ByteSequence {
656 seq_start: 0,
657 seq_end: 1
658 },
659 Atom::ByteSequence {
660 seq_start: 1,
661 seq_end: 2
662 },
663 Atom::ByteSequence {
664 seq_start: 2,
665 seq_end: 3
666 }
667 ]
668 );
669 assert_eq!(result.byte_sequence(), &[0x01, 0x02, 0x03]);
670 }
671
672 {
673 let parser = PatternParser::new("( 01 | 02 03 | FF )");
675 let result = parser.parse().unwrap();
676 assert_eq!(
677 result.atoms(),
678 &[
679 Atom::Branch {
680 left_len: 1,
681 right_len: 4
682 },
683 Atom::ByteSequence {
684 seq_start: 0,
685 seq_end: 1
686 },
687 Atom::Branch {
688 left_len: 2,
689 right_len: 1
690 },
691 Atom::ByteSequence {
692 seq_start: 1,
693 seq_end: 2
694 },
695 Atom::ByteSequence {
696 seq_start: 2,
697 seq_end: 3
698 },
699 Atom::ByteSequence {
700 seq_start: 3,
701 seq_end: 4
702 }
703 ]
704 );
705 assert_eq!(result.byte_sequence(), &[0x01, 0x02, 0x03, 0xFF]);
706 }
707
708 {
709 let parser = PatternParser::new("( 01 | ( 02 | 03 ) )");
711 let result = parser.parse().unwrap();
712 assert_eq!(
713 result.atoms(),
714 &[
715 Atom::Branch {
716 left_len: 1,
717 right_len: 3
718 },
719 Atom::ByteSequence {
720 seq_start: 0,
721 seq_end: 1
722 },
723 Atom::Branch {
724 left_len: 1,
725 right_len: 1
726 },
727 Atom::ByteSequence {
728 seq_start: 1,
729 seq_end: 2
730 },
731 Atom::ByteSequence {
732 seq_start: 2,
733 seq_end: 3
734 }
735 ]
736 );
737 assert_eq!(result.byte_sequence(), &[0x01, 0x02, 0x03]);
738 }
739
740 {
741 let parser = PatternParser::new("( (01 | 02) | 03 )");
743 let result = parser.parse().unwrap();
744 assert_eq!(
745 result.atoms(),
746 &[
747 Atom::Branch {
748 left_len: 3,
749 right_len: 1
750 },
751 Atom::Branch {
752 left_len: 1,
753 right_len: 1
754 },
755 Atom::ByteSequence {
756 seq_start: 0,
757 seq_end: 1
758 },
759 Atom::ByteSequence {
760 seq_start: 1,
761 seq_end: 2
762 },
763 Atom::ByteSequence {
764 seq_start: 2,
765 seq_end: 3
766 },
767 ]
768 );
769 assert_eq!(result.byte_sequence(), &[0x01, 0x02, 0x03]);
770 }
771
772 {
773 let parser = PatternParser::new("(");
775 let result = parser.parse().unwrap_err();
776 assert_eq!(
777 &result,
778 &PositionedError::new(0..1, ParseError::GroupNotClosed)
779 );
780 }
781
782 {
783 let parser = PatternParser::new("( FF 00");
785 let result = parser.parse().unwrap_err();
786 assert_eq!(
787 &result,
788 &PositionedError::new(0..1, ParseError::GroupNotClosed)
789 );
790 }
791
792 {
793 let parser = PatternParser::new(" (|");
795 let result = parser.parse().unwrap_err();
796 assert_eq!(
797 &result,
798 &PositionedError::new(1..2, ParseError::GroupNotClosed)
799 );
800 }
801 }
802
803 #[test]
804 fn test_range() {
805 {
806 let parser = PatternParser::new("[0] [123]");
808 let result = parser.parse().unwrap();
809 assert_eq!(
810 result.atoms(),
811 &[Atom::WildcardFixed(0), Atom::WildcardFixed(123)]
812 );
813 assert_eq!(result.byte_sequence(), &[]);
814 }
815
816 {
817 let parser = PatternParser::new("[0-10] [123- 999]");
819 let result = parser.parse().unwrap();
820 assert_eq!(
821 result.atoms(),
822 &[
823 Atom::WildcardRange { min: 0, max: 10 },
824 Atom::WildcardRange { min: 123, max: 999 }
825 ]
826 );
827 assert_eq!(result.byte_sequence(), &[]);
828 }
829
830 {
831 let parser = PatternParser::new("[0-]");
833 let result = parser.parse().unwrap_err();
834 assert_eq!(
835 &result,
836 &PositionedError::new(3..4, ParseError::UnexpectedToken)
837 );
838 }
839
840 {
841 let parser = PatternParser::new("[FF 0-3]");
843 let result = parser.parse().unwrap_err();
844 assert_eq!(*result.position(), 1..3);
845 assert!(matches!(result.inner(), ParseError::RangeBoundInvalid(_)));
846 }
847
848 {
849 let parser = PatternParser::new("[0-3");
851 let result = parser.parse().unwrap_err();
852 assert_eq!(
853 &result,
854 &PositionedError::new(3..4, ParseError::UnexpectedEnd)
855 );
856 }
857
858 {
859 let parser = PatternParser::new("[3-0]");
861 let result = parser.parse().unwrap_err();
862 assert_eq!(
863 &result,
864 &PositionedError::new(3..4, ParseError::RangeEndMustBeGraterThenStart)
865 );
866 }
867 }
868}