Skip to main content

goblin_sigscan_pattern/
lib.rs

1//! Pelite-style pattern parser used by `goblin-sigscan`.
2//!
3//! This crate parses textual signatures into [`Atom`] instructions that can be
4//! executed by scanner implementations.
5//! For scanner-facing onboarding and end-to-end examples, see
6//! `goblin_sigscan` crate docs: <https://docs.rs/goblin-sigscan/latest/goblin_sigscan/>.
7//! The canonical syntax tutorial lives on [`parse`].
8//!
9//! Cross references:
10//! - scanner runtime APIs: `goblin_sigscan::Scanner` and `goblin_sigscan::Matches`
11//! - prepared scanning path: `goblin_sigscan::PreparedPattern`
12
13use std::fmt;
14
15const MAX_PATTERN_SOURCE_BYTES: usize = 16 * 1024;
16const MAX_PATTERN_ATOMS: usize = u16::MAX as usize;
17const MAX_GROUP_ALTERNATIVES: usize = 1024;
18
19/// Pattern parser error.
20#[derive(Copy, Clone, Debug, Eq, PartialEq)]
21pub struct ParsePatError {
22    kind: PatError,
23    position: usize,
24}
25
26impl ParsePatError {
27    /// Returns the error kind.
28    pub fn kind(self) -> PatError {
29        self.kind
30    }
31
32    /// Returns the byte offset where parsing failed.
33    pub fn position(self) -> usize {
34        self.position
35    }
36}
37
38impl fmt::Display for ParsePatError {
39    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
40        write!(
41            f,
42            "Syntax Error @{}: {}.",
43            self.position,
44            self.kind.as_str()
45        )
46    }
47}
48
49impl std::error::Error for ParsePatError {}
50
51/// Pattern parsing error categories.
52#[derive(Copy, Clone, Debug, Eq, PartialEq)]
53pub enum PatError {
54    UnpairedHexDigit,
55    UnknownChar,
56    UnclosedQuote,
57    SaveOverflow,
58    ReadOperand,
59    SkipOperand,
60    GroupOperand,
61    StackError,
62    StackInvalid,
63    PatternTooLong,
64    PatternTooComplex,
65}
66
67impl PatError {
68    fn as_str(self) -> &'static str {
69        match self {
70            PatError::UnpairedHexDigit => "unpaired hex digit",
71            PatError::UnknownChar => "unknown character",
72            PatError::UnclosedQuote => "string missing end quote",
73            PatError::SaveOverflow => "save store overflow",
74            PatError::ReadOperand => "read operand error",
75            PatError::SkipOperand => "skip operand error",
76            PatError::GroupOperand => "group operand error",
77            PatError::StackError => "stack unbalanced",
78            PatError::StackInvalid => "stack must follow jump",
79            PatError::PatternTooLong => "pattern input too long",
80            PatError::PatternTooComplex => "pattern expands beyond supported complexity limits",
81        }
82    }
83}
84
85/// Pattern atoms.
86#[derive(Copy, Clone, Debug, Eq, PartialEq)]
87pub enum Atom {
88    /// Matches a single byte.
89    Byte(u8),
90    /// Applies a bitmask to the next byte comparison.
91    Fuzzy(u8),
92    /// Captures the cursor RVA in the save slot.
93    Save(u8),
94    /// Skips a fixed number of bytes.
95    Skip(u8),
96    /// Skips a ranged number of bytes (inclusive).
97    SkipRange(u16, u16),
98    /// Executes a recursive sub-pattern and then resumes at `cursor + skip`.
99    Push(u8),
100    /// Returns from a recursive sub-pattern.
101    Pop,
102    /// Follows a signed 1-byte relative jump.
103    Jump1,
104    /// Follows a signed 4-byte relative jump.
105    Jump4,
106    /// Follows an absolute pointer.
107    Ptr,
108    /// Follows a position-independent reference from a saved base slot.
109    Pir(u8),
110    /// Reads and sign-extends the byte under the cursor, writes to slot, advances by 1.
111    ReadI8(u8),
112    /// Reads and zero-extends the byte under the cursor, writes to slot, advances by 1.
113    ReadU8(u8),
114    /// Reads a little-endian `i16`, sign-extends, stores in save slot, advances by 2.
115    ReadI16(u8),
116    /// Reads a little-endian `u16`, zero-extends, stores in save slot, advances by 2.
117    ReadU16(u8),
118    /// Reads a little-endian `i32`, sign-extends, stores in save slot, advances by 4.
119    ReadI32(u8),
120    /// Reads a little-endian `u32`, stores it in save slot, and advances the cursor by 4.
121    ReadU32(u8),
122    /// Writes zero to the given save slot without advancing the cursor.
123    Zero(u8),
124    /// Rewinds the cursor by a fixed number of bytes.
125    Back(u8),
126    /// Fails if the cursor is not aligned to `(1 << value)` bytes.
127    Aligned(u8),
128    /// Fails if the cursor does not equal the value in the given save slot.
129    Check(u8),
130    /// Branches to an alternate pattern arm on failure.
131    Case(u16),
132    /// Jumps past remaining alternate arms when current arm succeeds.
133    Break(u16),
134    /// No-op instruction used to keep pattern control-flow offsets stable.
135    Nop,
136}
137
138/// Patterns are a list of [`Atom`]s.
139pub type Pattern = Vec<Atom>;
140
141/// Returns the save array length required by the pattern.
142pub fn save_len(pat: &[Atom]) -> usize {
143    pat.iter()
144        .filter_map(|atom| match atom {
145            Atom::Save(slot)
146            | Atom::ReadI8(slot)
147            | Atom::ReadU8(slot)
148            | Atom::ReadI16(slot)
149            | Atom::ReadU16(slot)
150            | Atom::ReadI32(slot)
151            | Atom::ReadU32(slot)
152            | Atom::Zero(slot)
153            | Atom::Check(slot)
154            | Atom::Pir(slot) => Some(usize::from(*slot) + 1),
155            _ => None,
156        })
157        .max()
158        .unwrap_or(0)
159}
160
161/// Parses a pelite-style signature string into atoms.
162///
163/// Parsing injects an implicit `Save(0)` at the beginning so slot `0` always
164/// represents the match base cursor for parsed patterns.
165///
166/// This is the main runtime entry point for pattern text.
167///
168/// # Syntax tutorial
169///
170/// Following are examples of the syntax supported by `goblin-sigscan`.
171///
172/// ```text
173/// 55 89 e5 83 ? ec
174/// ```
175///
176/// Case-insensitive hexadecimal pairs match exact bytes and question marks are
177/// wildcard bytes.
178///
179/// A single `?` matches a full byte. Partial nibble masks are not currently
180/// supported.
181///
182/// Whitespace has no semantic meaning and is only for readability.
183///
184/// ```text
185/// b9 ' 37 13 00 00
186/// ```
187///
188/// A single quote (`'`) stores the current cursor into the next save slot.
189///
190/// Save slot ordering is deterministic:
191///
192/// - `save[0]` is always the overall match start (`Save(0)` injected by parser)
193/// - `save[1..]` are captures in order of appearance (`'`, `i*`, `u*`, `z`, ...)
194///
195/// ```text
196/// b8 [16] 50 [13-42] ff
197/// ```
198///
199/// Bracket operands skip bytes:
200///
201/// - `[N]` skips exactly `N` bytes
202/// - `[A-B]` tries the range non-greedily (smallest skip first)
203///
204/// Internally `[A-B]` compiles to `SkipRange(A, B - 1)`.
205///
206/// ```text
207/// 31 c0 74 % ' c3
208/// e8 $ ' 31 c0 c3
209/// ```
210///
211/// `%` follows a signed rel8 target and `$` follows a signed rel32 target.
212///
213/// This composes with captures and read ops to recover referenced addresses and
214/// values without manual offset arithmetic.
215///
216/// ```text
217/// e8 $ { ' } 83 f0 5c c3
218/// ```
219///
220/// Curly braces must follow `%`, `$`, or `*`. The sub-pattern inside `{...}` runs at
221/// the jump destination. After it succeeds, scanning returns to the original
222/// stream position, skips jump bytes, and continues.
223///
224/// ```text
225/// e8 $ @4
226/// ```
227///
228/// `@n` checks alignment at that point in the scan. Alignment is `1 << n`
229/// bytes, so `@4` means 16-byte alignment.
230///
231/// ```text
232/// e8 i1 a0 u4 z
233/// ```
234///
235/// `i`/`u` read memory into save slots and advance the cursor by operand size:
236///
237/// - signed reads: `i1`, `i2`, `i4`
238/// - unsigned reads: `u1`, `u2`, `u4`
239/// - `z` writes a literal zero to a fresh slot
240///
241/// ```text
242/// 83 c0 2a ( 6a ? | 68 ? ? ? ? ) e8
243/// ```
244///
245/// Parentheses define alternatives separated by `|`. Arms are attempted from
246/// left to right and the pattern fails only if every arm fails.
247///
248/// ```text
249/// b8 "MZ" 00
250/// ```
251///
252/// Double-quoted strings emit literal byte sequences.
253///
254/// ## Pelite compatibility notes
255///
256/// `goblin-sigscan` intentionally tracks a practical subset of pelite syntax.
257///
258/// - Supported: hex bytes, `?`, `'`, `%`, `$`, `*`, `{...}`, `[N]`, `[A-B]`, `@n`,
259///   `i1/i2/i4`, `u1/u2/u4`, `z`, alternation, and strings.
260/// - Programmatic-only atoms (not parser syntax): `Pir(slot)`.
261///
262/// # Save-slot semantics
263///
264/// `parse` always prepends `Save(0)`, so parsed patterns always require at
265/// least one save slot. Use [`save_len`] to allocate scanner buffers.
266///
267/// If you are calling scanner APIs from `goblin-sigscan`, this means `save[0]`
268/// is always the match start for parsed patterns.
269///
270/// # Examples
271///
272/// ```
273/// use goblin_sigscan_pattern::{Atom, parse, save_len};
274///
275/// let atoms = parse("48 8B ? ? ? ? 48 89")?;
276/// assert_eq!(atoms.first(), Some(&Atom::Save(0)));
277/// assert_eq!(save_len(&atoms), 1);
278/// # Ok::<(), goblin_sigscan_pattern::ParsePatError>(())
279/// ```
280///
281/// Capturing a jump target plus a post-jump cursor capture:
282///
283/// ```
284/// use goblin_sigscan_pattern::{Atom, parse, save_len};
285///
286/// let atoms = parse("e8 ${'}")?;
287/// assert!(matches!(atoms[0], Atom::Save(0)));
288/// assert!(atoms.iter().any(|atom| matches!(atom, Atom::Jump4)));
289/// assert!(save_len(&atoms) >= 2);
290/// # Ok::<(), goblin_sigscan_pattern::ParsePatError>(())
291/// ```
292///
293/// Group alternatives:
294///
295/// ```
296/// use goblin_sigscan_pattern::{Atom, parse};
297///
298/// let atoms = parse("(85 c0 | 48 85 c0)")?;
299/// assert!(atoms.iter().any(|atom| matches!(atom, Atom::Case(_))));
300/// assert!(atoms.iter().any(|atom| matches!(atom, Atom::Break(_))));
301/// # Ok::<(), goblin_sigscan_pattern::ParsePatError>(())
302/// ```
303///
304/// # Errors
305///
306/// Returns [`ParsePatError`] with:
307///
308/// - a kind ([`PatError`])
309/// - a byte position in the source string
310///
311/// Common error kinds include:
312///
313/// - [`PatError::UnpairedHexDigit`]
314/// - [`PatError::SkipOperand`]
315/// - [`PatError::ReadOperand`]
316/// - [`PatError::GroupOperand`]
317/// - [`PatError::PatternTooLong`]
318/// - [`PatError::PatternTooComplex`]
319///
320/// # Quick compile-checked examples
321///
322/// ```
323/// use goblin_sigscan_pattern::{Atom, parse, save_len};
324///
325/// let atoms = parse("48 8B ? ? ? ? 48 89")?;
326/// assert_eq!(atoms.first(), Some(&Atom::Save(0)));
327/// assert_eq!(save_len(&atoms), 1);
328/// # Ok::<(), goblin_sigscan_pattern::ParsePatError>(())
329/// ```
330///
331/// ```
332/// use goblin_sigscan_pattern::{Atom, parse, save_len};
333///
334/// let atoms = parse("e8 ${'}")?;
335/// assert!(matches!(atoms[0], Atom::Save(0)));
336/// assert!(atoms.iter().any(|atom| matches!(atom, Atom::Jump4)));
337/// assert!(save_len(&atoms) >= 2);
338/// # Ok::<(), goblin_sigscan_pattern::ParsePatError>(())
339/// ```
340///
341/// ```
342/// use goblin_sigscan_pattern::{Atom, parse};
343///
344/// let atoms = parse("(85 c0 | 48 85 c0)")?;
345/// assert!(atoms.iter().any(|atom| matches!(atom, Atom::Case(_))));
346/// assert!(atoms.iter().any(|atom| matches!(atom, Atom::Break(_))));
347/// # Ok::<(), goblin_sigscan_pattern::ParsePatError>(())
348/// ```
349pub fn parse(pat: &str) -> Result<Pattern, ParsePatError> {
350    if pat.len() > MAX_PATTERN_SOURCE_BYTES {
351        return Err(ParsePatError {
352            kind: PatError::PatternTooLong,
353            position: MAX_PATTERN_SOURCE_BYTES,
354        });
355    }
356
357    let mut parser = Parser::new(pat);
358    let mut result = Vec::with_capacity(pat.len() / 2);
359    result.push(Atom::Save(0));
360    result.append(&mut parser.parse_sequence(&[])?);
361
362    if result.len() > MAX_PATTERN_ATOMS {
363        return Err(ParsePatError {
364            kind: PatError::PatternTooComplex,
365            position: pat.len(),
366        });
367    }
368
369    while matches!(result.last(), Some(Atom::Skip(_))) {
370        result.pop();
371    }
372
373    if parser.depth != 0 {
374        return Err(ParsePatError {
375            kind: PatError::StackError,
376            position: pat.len(),
377        });
378    }
379    if parser.peek().is_some() {
380        return Err(ParsePatError {
381            kind: PatError::GroupOperand,
382            position: parser.peek().map(|(pos, _)| pos).unwrap_or(pat.len()),
383        });
384    }
385
386    Ok(result)
387}
388
389struct Parser<'a> {
390    chars: Vec<(usize, char)>,
391    cursor: usize,
392    save: u8,
393    depth: u16,
394    _source: &'a str,
395}
396
397impl<'a> Parser<'a> {
398    fn new(source: &'a str) -> Self {
399        Self {
400            chars: source.char_indices().collect(),
401            cursor: 0,
402            save: 1,
403            depth: 0,
404            _source: source,
405        }
406    }
407
408    fn parse_sequence(&mut self, stoppers: &[char]) -> Result<Vec<Atom>, ParsePatError> {
409        let mut result = Vec::new();
410
411        while let Some((position, ch)) = self.peek() {
412            if stoppers.contains(&ch) {
413                break;
414            }
415
416            self.bump();
417            match ch {
418                ' ' | '\n' | '\r' | '\t' => {}
419                '?' => {
420                    push_skip(&mut result, 1);
421                }
422                '[' => self.parse_skip_operand(position, &mut result)?,
423                '\'' => {
424                    if self.save == u8::MAX {
425                        return Err(ParsePatError {
426                            kind: PatError::SaveOverflow,
427                            position,
428                        });
429                    }
430                    result.push(Atom::Save(self.save));
431                    self.save += 1;
432                }
433                '%' => result.push(Atom::Jump1),
434                '$' => result.push(Atom::Jump4),
435                '*' => result.push(Atom::Ptr),
436                '"' => {
437                    let mut closed = false;
438                    while let Some((_, next)) = self.bump() {
439                        if next == '"' {
440                            closed = true;
441                            break;
442                        }
443                        if !next.is_ascii() {
444                            return Err(ParsePatError {
445                                kind: PatError::UnknownChar,
446                                position,
447                            });
448                        }
449                        result.push(Atom::Byte(next as u8));
450                    }
451                    if !closed {
452                        return Err(ParsePatError {
453                            kind: PatError::UnclosedQuote,
454                            position,
455                        });
456                    }
457                }
458                '{' => {
459                    if self.depth == u16::MAX {
460                        return Err(ParsePatError {
461                            kind: PatError::StackError,
462                            position,
463                        });
464                    }
465                    self.depth += 1;
466                    let Some(last) = result.last_mut() else {
467                        return Err(ParsePatError {
468                            kind: PatError::StackInvalid,
469                            position,
470                        });
471                    };
472                    let replaced = match *last {
473                        Atom::Jump1 => {
474                            *last = Atom::Push(1);
475                            Atom::Jump1
476                        }
477                        Atom::Jump4 => {
478                            *last = Atom::Push(4);
479                            Atom::Jump4
480                        }
481                        Atom::Ptr => {
482                            *last = Atom::Push(0);
483                            Atom::Ptr
484                        }
485                        _ => {
486                            return Err(ParsePatError {
487                                kind: PatError::StackInvalid,
488                                position,
489                            });
490                        }
491                    };
492                    result.push(replaced);
493                }
494                '}' => {
495                    if self.depth == 0 {
496                        return Err(ParsePatError {
497                            kind: PatError::StackError,
498                            position,
499                        });
500                    }
501                    self.depth -= 1;
502                    result.push(Atom::Pop);
503                }
504                'i' | 'u' => {
505                    let signed = ch == 'i';
506                    let (_, op) = self.bump().ok_or(ParsePatError {
507                        kind: PatError::ReadOperand,
508                        position,
509                    })?;
510                    if self.save == u8::MAX {
511                        return Err(ParsePatError {
512                            kind: PatError::SaveOverflow,
513                            position,
514                        });
515                    }
516                    let slot = self.save;
517                    self.save += 1;
518                    let atom = match (signed, op) {
519                        (true, '1') => Atom::ReadI8(slot),
520                        (false, '1') => Atom::ReadU8(slot),
521                        (true, '2') => Atom::ReadI16(slot),
522                        (false, '2') => Atom::ReadU16(slot),
523                        (true, '4') => Atom::ReadI32(slot),
524                        (false, '4') => Atom::ReadU32(slot),
525                        _ => {
526                            return Err(ParsePatError {
527                                kind: PatError::ReadOperand,
528                                position,
529                            });
530                        }
531                    };
532                    result.push(atom);
533                }
534                'z' => {
535                    if self.save == u8::MAX {
536                        return Err(ParsePatError {
537                            kind: PatError::SaveOverflow,
538                            position,
539                        });
540                    }
541                    result.push(Atom::Zero(self.save));
542                    self.save += 1;
543                }
544                '@' => {
545                    let (next_pos, op) = self.bump().ok_or(ParsePatError {
546                        kind: PatError::ReadOperand,
547                        position,
548                    })?;
549                    let align = match op {
550                        '0'..='9' => op as u8 - b'0',
551                        'A'..='Z' => 10 + (op as u8 - b'A'),
552                        'a'..='z' => 10 + (op as u8 - b'a'),
553                        _ => {
554                            return Err(ParsePatError {
555                                kind: PatError::ReadOperand,
556                                position: next_pos,
557                            });
558                        }
559                    };
560                    result.push(Atom::Aligned(align));
561                }
562                '(' => {
563                    let alts = self.parse_group(position)?;
564                    result.append(&mut compile_alternatives(alts, position)?);
565                }
566                _ if ch.is_ascii_hexdigit() => {
567                    let (next_position, lo_ch) = self.bump().ok_or(ParsePatError {
568                        kind: PatError::UnpairedHexDigit,
569                        position,
570                    })?;
571                    if !lo_ch.is_ascii_hexdigit() {
572                        return Err(ParsePatError {
573                            kind: PatError::UnpairedHexDigit,
574                            position: next_position,
575                        });
576                    }
577                    let hi = hex_value(ch).expect("ascii hex already validated");
578                    let lo = hex_value(lo_ch).expect("ascii hex already validated");
579                    result.push(Atom::Byte((hi << 4) | lo));
580                }
581                _ => {
582                    return Err(ParsePatError {
583                        kind: PatError::UnknownChar,
584                        position,
585                    });
586                }
587            }
588
589            if result.len() > MAX_PATTERN_ATOMS {
590                return Err(ParsePatError {
591                    kind: PatError::PatternTooComplex,
592                    position,
593                });
594            }
595        }
596
597        Ok(result)
598    }
599
600    fn parse_group(&mut self, position: usize) -> Result<Vec<Vec<Atom>>, ParsePatError> {
601        let mut alternatives = Vec::new();
602        let group_save = self.save;
603        let group_depth = self.depth;
604        let mut max_save = self.save;
605        loop {
606            self.save = group_save;
607            self.depth = group_depth;
608            let seq = self.parse_sequence(&['|', ')'])?;
609            max_save = max_save.max(self.save);
610            alternatives.push(seq);
611            if alternatives.len() > MAX_GROUP_ALTERNATIVES {
612                return Err(ParsePatError {
613                    kind: PatError::PatternTooComplex,
614                    position,
615                });
616            }
617
618            let Some((stop_pos, stop)) = self.bump() else {
619                return Err(ParsePatError {
620                    kind: PatError::GroupOperand,
621                    position,
622                });
623            };
624
625            match stop {
626                '|' => continue,
627                ')' => break,
628                _ => {
629                    return Err(ParsePatError {
630                        kind: PatError::GroupOperand,
631                        position: stop_pos,
632                    });
633                }
634            }
635        }
636
637        if alternatives.is_empty() {
638            return Err(ParsePatError {
639                kind: PatError::GroupOperand,
640                position,
641            });
642        }
643
644        self.save = max_save;
645        self.depth = group_depth;
646
647        Ok(alternatives)
648    }
649
650    fn parse_skip_operand(
651        &mut self,
652        position: usize,
653        result: &mut Vec<Atom>,
654    ) -> Result<(), ParsePatError> {
655        let mut first: u32 = 0;
656        let mut second: u32 = 0;
657        let mut saw_first = false;
658        let mut saw_second = false;
659        let mut ranged = false;
660
661        while let Some((_, ch)) = self.bump() {
662            match ch {
663                '0'..='9' => {
664                    let digit = u32::from(ch as u8 - b'0');
665                    if !ranged {
666                        saw_first = true;
667                        first = first
668                            .checked_mul(10)
669                            .and_then(|n| n.checked_add(digit))
670                            .ok_or(ParsePatError {
671                                kind: PatError::SkipOperand,
672                                position,
673                            })?;
674                    } else {
675                        saw_second = true;
676                        second = second
677                            .checked_mul(10)
678                            .and_then(|n| n.checked_add(digit))
679                            .ok_or(ParsePatError {
680                                kind: PatError::SkipOperand,
681                                position,
682                            })?;
683                    }
684                }
685                '-' => {
686                    if ranged || !saw_first {
687                        return Err(ParsePatError {
688                            kind: PatError::SkipOperand,
689                            position,
690                        });
691                    }
692                    ranged = true;
693                }
694                ']' => {
695                    if !saw_first {
696                        return Err(ParsePatError {
697                            kind: PatError::SkipOperand,
698                            position,
699                        });
700                    }
701                    if !ranged {
702                        push_skip(result, first);
703                        return Ok(());
704                    }
705                    if !saw_second || second <= first {
706                        return Err(ParsePatError {
707                            kind: PatError::SkipOperand,
708                            position,
709                        });
710                    }
711                    let min = u16::try_from(first).map_err(|_| ParsePatError {
712                        kind: PatError::SkipOperand,
713                        position,
714                    })?;
715                    let max = u16::try_from(second - 1).map_err(|_| ParsePatError {
716                        kind: PatError::SkipOperand,
717                        position,
718                    })?;
719                    result.push(Atom::SkipRange(min, max));
720                    return Ok(());
721                }
722                _ => {
723                    return Err(ParsePatError {
724                        kind: PatError::SkipOperand,
725                        position,
726                    });
727                }
728            }
729        }
730
731        Err(ParsePatError {
732            kind: PatError::SkipOperand,
733            position,
734        })
735    }
736
737    fn peek(&self) -> Option<(usize, char)> {
738        self.chars.get(self.cursor).copied()
739    }
740
741    fn bump(&mut self) -> Option<(usize, char)> {
742        let value = self.peek()?;
743        self.cursor += 1;
744        Some(value)
745    }
746}
747
748fn compile_alternatives(
749    mut alts: Vec<Vec<Atom>>,
750    position: usize,
751) -> Result<Vec<Atom>, ParsePatError> {
752    debug_assert!(!alts.is_empty(), "alternatives parser guarantees non-empty");
753    if alts.len() == 1 {
754        let only = alts
755            .pop()
756            .expect("alternatives parser guarantees exactly one arm remains");
757        return Ok(only);
758    }
759
760    let first = alts.remove(0);
761    let rest = compile_alternatives(alts, position)?;
762    let case_skip = u16::try_from(first.len() + 2).map_err(|_| ParsePatError {
763        kind: PatError::PatternTooComplex,
764        position,
765    })?;
766    let break_skip = u16::try_from(rest.len()).map_err(|_| ParsePatError {
767        kind: PatError::PatternTooComplex,
768        position,
769    })?;
770
771    let mut out = Vec::with_capacity(2 + first.len() + rest.len());
772    out.push(Atom::Case(case_skip));
773    out.extend(first);
774    out.push(Atom::Break(break_skip));
775    out.extend(rest);
776    if out.len() > MAX_PATTERN_ATOMS {
777        return Err(ParsePatError {
778            kind: PatError::PatternTooComplex,
779            position,
780        });
781    }
782    Ok(out)
783}
784
785fn hex_value(ch: char) -> Option<u8> {
786    match ch {
787        '0'..='9' => Some(ch as u8 - b'0'),
788        'a'..='f' => Some(ch as u8 - b'a' + 10),
789        'A'..='F' => Some(ch as u8 - b'A' + 10),
790        _ => None,
791    }
792}
793
794fn push_skip(result: &mut Vec<Atom>, mut remaining: u32) {
795    while remaining != 0 {
796        let chunk = u8::try_from(remaining.min(u32::from(u8::MAX))).expect("chunk is bounded");
797        remaining -= u32::from(chunk);
798
799        if let Some(Atom::Skip(prev)) = result.last_mut() {
800            let free = u8::MAX - *prev;
801            if free != 0 {
802                let to_add = free.min(chunk);
803                *prev += to_add;
804                let leftover = chunk - to_add;
805                if leftover != 0 {
806                    result.push(Atom::Skip(leftover));
807                }
808                continue;
809            }
810        }
811
812        result.push(Atom::Skip(chunk));
813    }
814}
815
816#[cfg(test)]
817mod tests {
818    use proptest::prelude::*;
819
820    use super::{Atom, ParsePatError, PatError, parse};
821
822    #[test]
823    fn parses_used_subset() {
824        assert_eq!(
825            parse("488915${'} 488942"),
826            Ok(vec![
827                Atom::Save(0),
828                Atom::Byte(0x48),
829                Atom::Byte(0x89),
830                Atom::Byte(0x15),
831                Atom::Push(4),
832                Atom::Jump4,
833                Atom::Save(1),
834                Atom::Pop,
835                Atom::Byte(0x48),
836                Atom::Byte(0x89),
837                Atom::Byte(0x42),
838            ])
839        );
840
841        assert_eq!(
842            parse("68*'31c0c3"),
843            Ok(vec![
844                Atom::Save(0),
845                Atom::Byte(0x68),
846                Atom::Ptr,
847                Atom::Save(1),
848                Atom::Byte(0x31),
849                Atom::Byte(0xc0),
850                Atom::Byte(0xc3),
851            ])
852        );
853
854        assert_eq!(
855            parse("*{'90}"),
856            Ok(vec![
857                Atom::Save(0),
858                Atom::Push(0),
859                Atom::Ptr,
860                Atom::Save(1),
861                Atom::Byte(0x90),
862                Atom::Pop,
863            ])
864        );
865
866        assert_eq!(
867            parse("44 8B 81 u4 48 8D 0D"),
868            Ok(vec![
869                Atom::Save(0),
870                Atom::Byte(0x44),
871                Atom::Byte(0x8B),
872                Atom::Byte(0x81),
873                Atom::ReadU32(1),
874                Atom::Byte(0x48),
875                Atom::Byte(0x8D),
876                Atom::Byte(0x0D),
877            ])
878        );
879
880        assert_eq!(
881            parse("488b1d${'} 48891d[4] 4c63b3"),
882            Ok(vec![
883                Atom::Save(0),
884                Atom::Byte(0x48),
885                Atom::Byte(0x8b),
886                Atom::Byte(0x1d),
887                Atom::Push(4),
888                Atom::Jump4,
889                Atom::Save(1),
890                Atom::Pop,
891                Atom::Byte(0x48),
892                Atom::Byte(0x89),
893                Atom::Byte(0x1d),
894                Atom::Skip(4),
895                Atom::Byte(0x4c),
896                Atom::Byte(0x63),
897                Atom::Byte(0xb3),
898            ])
899        );
900
901        assert_eq!(
902            parse("\"hello\" 00"),
903            Ok(vec![
904                Atom::Save(0),
905                Atom::Byte(b'h'),
906                Atom::Byte(b'e'),
907                Atom::Byte(b'l'),
908                Atom::Byte(b'l'),
909                Atom::Byte(b'o'),
910                Atom::Byte(0x00),
911            ])
912        );
913    }
914
915    #[test]
916    fn reports_error_position() {
917        assert_eq!(
918            parse("4G") as Result<Vec<Atom>, ParsePatError>,
919            Err(ParsePatError {
920                kind: PatError::UnpairedHexDigit,
921                position: 1,
922            })
923        );
924
925        assert_eq!(
926            parse("u8") as Result<Vec<Atom>, ParsePatError>,
927            Err(ParsePatError {
928                kind: PatError::ReadOperand,
929                position: 0,
930            })
931        );
932
933        assert_eq!(
934            parse("\"unterminated") as Result<Vec<Atom>, ParsePatError>,
935            Err(ParsePatError {
936                kind: PatError::UnclosedQuote,
937                position: 0,
938            })
939        );
940    }
941
942    #[test]
943    fn group_alternatives_reset_and_merge_save_state() {
944        assert_eq!(
945            parse("('41|'42)'"),
946            Ok(vec![
947                Atom::Save(0),
948                Atom::Case(4),
949                Atom::Save(1),
950                Atom::Byte(0x41),
951                Atom::Break(2),
952                Atom::Save(1),
953                Atom::Byte(0x42),
954                Atom::Save(2),
955            ])
956        );
957    }
958
959    #[test]
960    fn range_skip_uses_strict_upper_bound() {
961        assert_eq!(
962            parse("[5-6]"),
963            Ok(vec![Atom::Save(0), Atom::SkipRange(5, 5)])
964        );
965
966        assert_eq!(
967            parse("[5-5]") as Result<Vec<Atom>, ParsePatError>,
968            Err(ParsePatError {
969                kind: PatError::SkipOperand,
970                position: 0,
971            })
972        );
973    }
974
975    #[test]
976    fn wildcard_semantics_match_pelite() {
977        assert_eq!(
978            parse("A?") as Result<Vec<Atom>, ParsePatError>,
979            Err(ParsePatError {
980                kind: PatError::UnpairedHexDigit,
981                position: 1,
982            })
983        );
984
985        assert_eq!(parse("?"), Ok(vec![Atom::Save(0)]));
986        assert_eq!(parse("??"), Ok(vec![Atom::Save(0)]));
987
988        assert_eq!(
989            parse("4183?03"),
990            Ok(vec![
991                Atom::Save(0),
992                Atom::Byte(0x41),
993                Atom::Byte(0x83),
994                Atom::Skip(1),
995                Atom::Byte(0x03),
996            ])
997        );
998    }
999
1000    #[test]
1001    fn supports_aligned_base36_syntax() {
1002        assert_eq!(parse("@4"), Ok(vec![Atom::Save(0), Atom::Aligned(4)]));
1003        assert_eq!(parse("@A"), Ok(vec![Atom::Save(0), Atom::Aligned(10)]));
1004        assert_eq!(parse("@f"), Ok(vec![Atom::Save(0), Atom::Aligned(15)]));
1005        assert_eq!(parse("@z"), Ok(vec![Atom::Save(0), Atom::Aligned(35)]));
1006
1007        assert_eq!(
1008            parse("@") as Result<Vec<Atom>, ParsePatError>,
1009            Err(ParsePatError {
1010                kind: PatError::ReadOperand,
1011                position: 0,
1012            })
1013        );
1014        assert_eq!(
1015            parse("@_") as Result<Vec<Atom>, ParsePatError>,
1016            Err(ParsePatError {
1017                kind: PatError::ReadOperand,
1018                position: 1,
1019            })
1020        );
1021        assert_eq!(
1022            parse("@?") as Result<Vec<Atom>, ParsePatError>,
1023            Err(ParsePatError {
1024                kind: PatError::ReadOperand,
1025                position: 1,
1026            })
1027        );
1028    }
1029
1030    #[test]
1031    fn save_len_counts_programmatic_pir_slots() {
1032        let pat = [Atom::Save(0), Atom::Pir(3)];
1033        assert_eq!(super::save_len(&pat), 4);
1034    }
1035
1036    proptest! {
1037        #[test]
1038        fn parsed_patterns_preserve_base_capture_and_slot_bounds(source in "[ -~]{0,128}") {
1039            if let Ok(atoms) = parse(&source) {
1040                prop_assert!(!atoms.is_empty());
1041                prop_assert_eq!(atoms[0], Atom::Save(0));
1042
1043                let required_slots = super::save_len(&atoms);
1044                prop_assert!(required_slots >= 1);
1045                prop_assert!(required_slots <= usize::from(u8::MAX));
1046            }
1047        }
1048    }
1049}