bparse/
pattern.rs

1// The Clone & Copy super traits allow patterns to be re-used even when a function returns `impl Pattern`
2/// Expresses that the implementing type may be used to match a byte slice.
3pub trait Pattern: Clone + Copy {
4    /// Evaluates the pattern against a byte slice.
5    /// If the pattern matches, the length of matching slice should be returned.
6    /// Otherwise, `None` should be returned.
7    ///
8    /// It is assumed that the pattern begins matching from the start of the slice.
9    fn eval(&self, input: &[u8]) -> Option<usize>;
10
11    /// Returns a new pattern that will match if `self` and `other` match in sequence.
12    ///
13    /// ```
14    /// use bparse::Pattern;
15    ///
16    /// let pattern = "a".then("b");
17    /// assert_eq!(pattern.eval(b"abc"), Some(2));
18    /// ```
19    fn then<P>(self, other: P) -> Sequence<Self, P>
20    where
21        Self: Sized,
22        P: Pattern,
23    {
24        Sequence {
25            pat1: self,
26            pat2: other,
27        }
28    }
29
30    /// Returns a new pattern that will match if either `self` or `other` match.
31    ///
32    /// ```
33    /// use bparse::Pattern;
34    ///
35    /// let pattern = "a".or("b");
36    /// assert_eq!(pattern.eval(b"ba"), Some(1));
37    /// ```
38    fn or<P>(self, other: P) -> Choice<Self, P>
39    where
40        Self: Sized,
41        P: Pattern,
42    {
43        Choice {
44            pat1: self,
45            pat2: other,
46        }
47    }
48}
49
50impl<P> Pattern for &P
51where
52    P: Pattern,
53{
54    fn eval(&self, input: &[u8]) -> Option<usize> {
55        (*self).eval(input)
56    }
57}
58
59/// See [`Pattern::or`]
60#[derive(Clone, Copy, Debug)]
61pub struct Choice<C1, C2> {
62    pat1: C1,
63    pat2: C2,
64}
65
66impl<P1, P2> Pattern for Choice<P1, P2>
67where
68    P1: Pattern,
69    P2: Pattern,
70{
71    fn eval(&self, input: &[u8]) -> Option<usize> {
72        match self.pat1.eval(input) {
73            Some(res) => Some(res),
74            None => self.pat2.eval(input),
75        }
76    }
77}
78
79/// See [`Pattern::then`]
80#[derive(Clone, Copy, Debug)]
81pub struct Sequence<P1, P2> {
82    pat1: P1,
83    pat2: P2,
84}
85
86impl<P1, P2> Pattern for Sequence<P1, P2>
87where
88    P1: Pattern,
89    P2: Pattern,
90{
91    fn eval(&self, input: &[u8]) -> Option<usize> {
92        if let Some(a) = self.pat1.eval(input) {
93            if let Some(b) = self.pat2.eval(&input[a..]) {
94                return Some(a + b);
95            }
96        }
97        None
98    }
99}
100
101/// Returns a new pattern that always matches the next byte in the input if it exists.
102///
103/// ```
104/// use bparse::{Pattern, byte};
105///
106/// assert_eq!(byte().eval(&[1, 2, 3]), Some(1));
107/// ```
108pub fn byte() -> One {
109    One
110}
111
112/// See [`one`]
113#[derive(Debug, Clone, Copy)]
114pub struct One;
115
116impl Pattern for One {
117    fn eval(&self, input: &[u8]) -> Option<usize> {
118        if input.is_empty() {
119            return None;
120        }
121
122        Some(1)
123    }
124}
125
126/// Returns a new pattern that matches the next utf-8 codepoint if it exists.
127///
128/// ```
129/// use bparse::{Pattern, codepoint};
130///
131/// assert_eq!(codepoint().eval("๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆa".as_bytes()), Some(4))
132/// ```
133pub fn codepoint() -> Codepoint {
134    Codepoint
135}
136
137/// See [`codepoint`]
138#[derive(Debug, Clone, Copy)]
139pub struct Codepoint;
140
141impl Pattern for Codepoint {
142    fn eval(&self, input: &[u8]) -> Option<usize> {
143        // A codepoint can be at most 4 bytes long
144        let lookahead = std::cmp::min(input.len(), 4);
145        let chunk = String::from_utf8_lossy(&input[..lookahead]);
146        let size = match chunk.chars().next() {
147            Some(c) if c == char::REPLACEMENT_CHARACTER => return None,
148            None => return None,
149            Some(c) => c.len_utf8(),
150        };
151        Some(size)
152    }
153}
154
155/// Returns a new pattern that will match if `slice` is a prefix of the input.
156///
157/// ```
158/// use bparse::{Pattern, prefix};
159///
160/// let pattern = prefix(&[1]);
161/// assert_eq!(pattern.eval(&[1, 2, 3]), Some(1));
162/// ```
163///
164/// As a convenience, the `Pattern` trait is implemented for slices and bytes with the same effect:
165///
166/// ```
167/// use bparse::Pattern;
168///
169/// assert_eq!((&b"ab"[..]).eval(b"abc"), Some(2));
170/// ```
171///
172/// If you are working with ascii or utf8 byte slices, consider [`utf8`]
173pub fn prefix(slice: &[u8]) -> Prefix<'_> {
174    Prefix(slice)
175}
176
177/// Returns a new pattern that will match the utf8 string slice `s` at the start of the input.
178///
179/// ```
180/// use bparse::{Pattern, utf8};
181///
182/// let pattern = utf8("he");
183/// assert_eq!(pattern.eval(b"hello"), Some(2));
184/// ```
185///
186/// As a convenience, the `Pattern` trait is implemented for string slices with the same effect:
187///
188/// ```
189/// use bparse::Pattern;
190///
191/// assert_eq!("he".eval(b"hello"), Some(2));
192/// ```
193pub fn utf8(s: &str) -> Prefix<'_> {
194    Prefix(s.as_bytes())
195}
196
197impl Pattern for &str {
198    fn eval(&self, input: &[u8]) -> Option<usize> {
199        utf8(self).eval(input)
200    }
201}
202
203impl Pattern for &[u8] {
204    fn eval(&self, input: &[u8]) -> Option<usize> {
205        Prefix(self).eval(input)
206    }
207}
208
209impl Pattern for u8 {
210    fn eval(&self, input: &[u8]) -> Option<usize> {
211        Prefix(&[*self]).eval(input)
212    }
213}
214
215/// See [`prefix`]
216#[derive(Debug, Clone, Copy)]
217pub struct Prefix<'p>(&'p [u8]);
218
219impl Pattern for Prefix<'_> {
220    fn eval(&self, input: &[u8]) -> Option<usize> {
221        if !input.starts_with(self.0) {
222            return None;
223        }
224
225        Some(self.0.len())
226    }
227}
228
229/// Returns a pattern that will match any byte in `bytes` at the start of the input
230///
231/// ```
232/// use bparse::{Pattern, oneof};
233///
234/// let pattern = oneof(b"?!.:");
235/// assert_eq!(pattern.eval(b"hi"), None);
236/// assert_eq!(pattern.eval(b"!!"), Some(1));
237/// assert_eq!(pattern.eval(b":"), Some(1));
238/// ```
239pub fn oneof(bytes: &[u8]) -> ByteLookupTable {
240    let mut set: [bool; 256] = [false; 256];
241
242    let mut i = 0;
243    while i < bytes.len() {
244        set[bytes[i] as usize] = true;
245        i += 1;
246    }
247
248    ByteLookupTable(set)
249}
250
251/// Inverse of [`oneof`].
252pub fn noneof(bytes: &[u8]) -> ByteLookupTable {
253    let mut set = oneof(bytes).0;
254    let mut i = 0;
255    while i < set.len() {
256        set[i] = !set[i];
257        i += 1;
258    }
259
260    ByteLookupTable(set)
261}
262
263/// Returns a pattern that will match any ascii digit at the start of the input
264///
265/// ```
266/// use bparse::{Pattern, digit};
267///
268/// assert_eq!(digit().eval(b"a"), None);
269/// assert_eq!(digit().eval(b"8"), Some(1));
270/// ```
271pub fn digit() -> ByteLookupTable {
272    oneof(b"0123456789")
273}
274
275/// Returns a pattern that will match any ascii letter at the start of the input
276///
277/// ```
278/// use bparse::{Pattern, alpha};
279///
280/// assert_eq!(alpha().eval(b"1"), None);
281/// assert_eq!(alpha().eval(b"a"), Some(1));
282/// ```
283pub fn alpha() -> ByteLookupTable {
284    oneof(b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
285}
286
287/// Returns a pattern that will match any hexadecimal character at the start of the input
288///
289/// ```
290/// use bparse::{Pattern, hex};
291///
292/// assert_eq!(hex().eval(b"z"), None);
293/// assert_eq!(hex().eval(b"f"), Some(1));
294/// ```
295pub fn hex() -> Choice<ByteLookupTable, ByteLookupTable> {
296    oneof(b"abcdefABCDEF").or(digit())
297}
298
299/// See [`oneof`]
300#[derive(Debug, Clone, Copy)]
301pub struct ByteLookupTable([bool; 256]);
302
303impl Pattern for ByteLookupTable {
304    fn eval<'i>(&self, input: &[u8]) -> Option<usize> {
305        let first = *input.first()?;
306
307        if self.0[first as usize] {
308            return Some(1);
309        }
310        None
311    }
312}
313/// Returns a new pattern that will match an element in the closed interval `[lo, hi]`
314///
315/// ```
316/// use bparse::{Pattern, range};
317///
318/// let pattern = range(b'a', b'z');
319/// assert_eq!(pattern.eval(b"b"), Some(1));
320/// ```
321pub fn range(lo: u8, hi: u8) -> ElementRange {
322    ElementRange(lo, hi)
323}
324
325/// See [`range()`]
326#[derive(Debug, Clone, Copy)]
327pub struct ElementRange(u8, u8);
328
329impl Pattern for ElementRange {
330    fn eval(&self, input: &[u8]) -> Option<usize> {
331        let first = input.first()?;
332
333        if first >= &self.0 && first <= &self.1 {
334            return Some(1);
335        }
336
337        None
338    }
339}
340
341/// Returns a new pattern that matches as many repetitions as possible of the given `pattern`, including 0.
342///
343/// ```
344/// use bparse::{Pattern, any};
345///
346/// assert_eq!(any("a").eval(b"aaa"), Some(3));
347/// assert_eq!(any("a").eval(b"aa"), Some(2));
348/// assert_eq!(any("a").eval(b""), Some(0));
349/// ```
350pub fn any<P>(pattern: P) -> Repetition<P>
351where
352    P: Pattern,
353{
354    Repetition {
355        lo: 0,
356        hi: None,
357        pattern,
358    }
359}
360
361/// Returns a new pattern that matches at least `n` repetitions of `pattern`.
362///
363/// ```
364/// use bparse::{Pattern, at_least};
365///
366/// assert_eq!(at_least(2, "a").eval(b"a"), None);
367/// assert_eq!(at_least(2, "a").eval(b"aaa"), Some(3));
368/// ```
369pub fn at_least<P>(n: usize, pattern: P) -> Repetition<P>
370where
371    P: Pattern,
372{
373    Repetition {
374        lo: n,
375        hi: None,
376        pattern,
377    }
378}
379
380/// Returns a new pattern that matches at most `n` repetitions of `pattern`.
381///
382/// ```
383/// use bparse::{Pattern, at_most};
384///
385/// assert_eq!(at_most(2, "b").eval(b"b"), Some(1));
386/// assert_eq!(at_most(2, "b").eval(b"bbbb"), Some(2));
387/// assert_eq!(at_most(2, "b").eval(b"aa"), Some(0));
388/// ```
389pub fn at_most<P>(n: usize, pattern: P) -> Repetition<P>
390where
391    P: Pattern,
392{
393    Repetition {
394        lo: 0,
395        hi: Some(n),
396        pattern,
397    }
398}
399
400/// Returns a new pattern that matches 0 or 1 repetitions of `pattern`
401///
402/// This implies the new pattern always matches the input.
403///
404/// ```
405/// use bparse::{Pattern, optional};
406///
407/// assert_eq!(optional("a").eval(b"b"), Some(0));
408/// assert_eq!(optional("a").eval(b"a"), Some(1));
409/// ```
410pub fn optional<P>(pattern: P) -> Repetition<P>
411where
412    P: Pattern,
413{
414    Repetition {
415        lo: 0,
416        hi: Some(1),
417        pattern,
418    }
419}
420
421/// Returns a new pattern that matches exactly `n` repetitions of `pattern`.
422///
423/// ```
424/// use bparse::{Pattern, count};
425///
426/// assert_eq!(count(2, "z").eval(b"zzz"), Some(2));
427/// assert_eq!(count(2, "z").eval(b"z"), None);
428/// ```
429pub fn count<P>(n: usize, pattern: P) -> Repetition<P>
430where
431    P: Pattern,
432{
433    Repetition {
434        lo: n,
435        hi: Some(n),
436        pattern,
437    }
438}
439
440/// Returns a new pattern that matches between `lo` and `hi` repetitions of `pattern`.
441///
442/// Both bounds are inclusive.
443pub fn between<P>(lo: usize, hi: usize, pattern: P) -> Repetition<P>
444where
445    P: Pattern,
446{
447    Repetition {
448        lo,
449        hi: Some(hi),
450        pattern,
451    }
452}
453
454/// Exppresses pattern repetition.
455///
456/// Many patterns (e.g. [`between`], [`optional`], [`at_least`]) are expressed in terms of this pattern.
457#[derive(Debug, Clone, Copy)]
458pub struct Repetition<P> {
459    pattern: P,
460    lo: usize,
461    hi: Option<usize>,
462}
463
464impl<P> Pattern for Repetition<P>
465where
466    P: Pattern,
467{
468    fn eval<'i>(&self, input: &[u8]) -> Option<usize> {
469        let mut match_count = 0;
470        let mut offset = 0;
471
472        if let Some(upper_bound) = self.hi {
473            assert!(
474                upper_bound >= self.lo,
475                "upper bound should be greater than or equal to the lower bound"
476            );
477        };
478
479        loop {
480            // We hit the upper bound of pattern repetition
481            if let Some(upper_bound) = self.hi {
482                if match_count == upper_bound {
483                    return Some(offset);
484                }
485            };
486
487            let Some(value) = self.pattern.eval(&input[offset..]) else {
488                if match_count < self.lo {
489                    return None;
490                }
491                return Some(offset);
492            };
493
494            match_count += 1;
495            offset += value;
496        }
497    }
498}
499
500/// Returns a new pattern that matches only if `pattern` does not match.
501///
502/// This pattern returns `Some(0)` if the underlying pattern did not match which makes it function as a negative lookahead.
503pub fn not<P>(pattern: P) -> Not<P>
504where
505    P: Pattern,
506{
507    Not(pattern)
508}
509
510/// See [`not`]
511#[derive(Debug, Clone, Copy)]
512pub struct Not<P>(P);
513
514impl<P> Pattern for Not<P>
515where
516    P: Pattern,
517{
518    fn eval<'i>(&self, input: &[u8]) -> Option<usize> {
519        if self.0.eval(input).is_some() {
520            return None;
521        }
522
523        Some(0)
524    }
525}
526
527/// Returns a pattern that matches the end of the slice.
528pub fn end() -> Eof {
529    Eof
530}
531
532/// See [`end`]
533#[derive(Debug, Clone, Copy)]
534pub struct Eof;
535
536impl Pattern for Eof {
537    fn eval(&self, input: &[u8]) -> Option<usize> {
538        input.is_empty().then_some(0)
539    }
540}
541
542#[cfg(test)]
543mod tests {
544    use super::*;
545
546    #[test]
547    fn match_prefix() {
548        // Pattern is empty, input is empty:
549        assert_eq!("".eval(b""), Some(0));
550        // Pattern is empty, input is not empty:
551        assert_eq!("".eval(b"aa"), Some(0));
552        // Pattern is not empty, input is empty:
553        assert_eq!("aa".eval(b""), None);
554        // Pattern is not empty, input length is less than pattern length:
555        assert_eq!("aa".eval(b"a"), None);
556        assert_eq!("aa".eval(b"b"), None);
557        // Pattern is not empty, input length is equal to pattern length:
558        assert_eq!("aa".eval(b"aa"), Some(2));
559        assert_eq!("aa".eval(b"bb"), None);
560        // Pattern is not empty, input length is greater than pattern length:
561        assert_eq!("aa".eval(b"aaa"), Some(2));
562        assert_eq!("aa".eval(b"bbb"), None);
563    }
564
565    #[test]
566    fn match_codepoint() {
567        assert_eq!(codepoint().eval(b"a"), Some(1));
568        assert_eq!(codepoint().eval("๐Ÿ‘ป".as_bytes()), Some(4));
569        assert_eq!(codepoint().eval(b"\xff"), None);
570    }
571
572    #[test]
573    fn match_range() {
574        // Range bounds are the same, input is empty:
575        assert_eq!(range(b'5', b'5').eval(b""), None);
576        // Range bounds are the same, input begins with a byte identical to bounds:
577        assert_eq!(range(b'5', b'5').eval(b"5"), Some(1));
578        // Range bounds are the same, input begins with outside bounds:
579        assert_eq!(range(b'5', b'5').eval(b"4"), None);
580        assert_eq!(range(b'5', b'5').eval(b"6"), None);
581        // Range bounds are different, input is empty:
582        assert_eq!(range(b'3', b'5').eval(b""), None);
583        // Range bounds are different, input begins with byte identical to bounds:
584        assert_eq!(range(b'3', b'5').eval(b"3"), Some(1));
585        assert_eq!(range(b'3', b'5').eval(b"5"), Some(1));
586        // Range bounds are different, input begins with byte within bounds:
587        assert_eq!(range(b'3', b'5').eval(b"4"), Some(1));
588        // Range bounds are different, input begins with byte outside bounds:
589        assert_eq!(range(b'3', b'5').eval(b"2"), None);
590        assert_eq!(range(b'3', b'5').eval(b"6"), None);
591    }
592
593    #[test]
594    fn match_oneof_noneof() {
595        // Set is empty, input is empty:
596        assert_eq!(oneof(b"").eval(b""), None);
597        assert_eq!(noneof(b"").eval(b""), None);
598
599        // Set is empty, input is not empty:
600        assert_eq!(oneof(b"").eval(b"a"), None);
601        assert_eq!(noneof(b"").eval(b"a"), Some(1));
602
603        // Set is not empty, input is empty:
604        assert_eq!(oneof(b"ab").eval(b""), None);
605        assert_eq!(noneof(b"ab").eval(b""), None);
606
607        // Set is not empty, input begins with byte in set
608        assert_eq!(oneof(b"ab").eval(b"a"), Some(1));
609        assert_eq!(oneof(b"ab").eval(b"b"), Some(1));
610
611        assert_eq!(noneof(b"ab").eval(b"a"), None);
612        assert_eq!(noneof(b"ab").eval(b"b"), None);
613
614        // Set is not empty, input begins with byte not in set
615        assert_eq!(oneof(b"ab").eval(b"c"), None);
616        assert_eq!(noneof(b"ab").eval(b"c"), Some(1));
617    }
618
619    #[test]
620    fn match_choice() {
621        // left matches, right matches:
622        assert_eq!("a".or("aa").eval(b"aa"), Some(1));
623        assert_eq!("aa".or("a").eval(b"aa"), Some(2));
624        // left matches, right does not match:
625        assert_eq!("a".or("b").eval(b"a"), Some(1));
626
627        // left does not match, right matches:
628        assert_eq!("b".or("a").eval(b"a"), Some(1));
629        // left does not match, right does not match:
630        assert_eq!("b".or("a").eval(b"c"), None);
631    }
632
633    #[test]
634    fn match_sequence() {
635        // left matches, right matches:
636        assert_eq!("a".then("b").eval(b"ab"), Some(2));
637        // left matches, right does not match:
638        assert_eq!("a".then("c").eval(b"ab"), None);
639
640        // left does not match, right matches:
641        assert_eq!("a".then("").eval(b"b"), None);
642        // left does not match, right does not match:
643        assert_eq!("a".then("c").eval(b"b"), None);
644    }
645
646    #[test]
647    fn patterns_are_reusable() {
648        let pattern = "a".then("b".or("c"));
649        assert_eq!(pattern.eval(b"ac"), Some(2));
650        assert_eq!(pattern.eval(b"ab"), Some(2));
651    }
652
653    #[test]
654    fn repetition_patterns() {
655        // repeats exactly 0 times
656        assert_eq!(count(0, "a").eval(b""), Some(0));
657        assert_eq!(count(0, "a").eval(b"a"), Some(0));
658        assert_eq!(count(0, "a").eval(b"aa"), Some(0));
659
660        // repeats exactly n times
661        assert_eq!(count(3, "a").eval(b""), None);
662        assert_eq!(count(3, "a").eval(b"aa"), None);
663        assert_eq!(count(3, "a").eval(b"aaa"), Some(3));
664        assert_eq!(count(3, "a").eval(b"aaaa"), Some(3));
665
666        // repeats any number of times
667        assert_eq!(any("a").eval(b""), Some(0));
668        assert_eq!(any("a").eval(b"b"), Some(0));
669        assert_eq!(any("a").eval(b"aa"), Some(2));
670        assert_eq!(any("a").eval(b"aab"), Some(2));
671
672        // repeats at least 0 times
673        assert_eq!(at_least(0, "a").eval(b""), Some(0));
674        assert_eq!(at_least(0, "a").eval(b"a"), Some(1));
675        assert_eq!(at_least(0, "a").eval(b"aaaa"), Some(4));
676
677        // repeats at least n times
678        assert_eq!(at_least(3, "a").eval(b""), None);
679        assert_eq!(at_least(3, "a").eval(b"aa"), None);
680        assert_eq!(at_least(3, "a").eval(b"aaa"), Some(3));
681        assert_eq!(at_least(3, "a").eval(b"aaaaa"), Some(5));
682
683        // repeats at most 0 times
684        assert_eq!(at_most(0, "a").eval(b""), Some(0));
685        assert_eq!(at_most(0, "a").eval(b"a"), Some(0));
686        assert_eq!(at_most(0, "a").eval(b"aa"), Some(0));
687
688        // repeats at most n times
689        assert_eq!(at_most(3, "a").eval(b""), Some(0));
690        assert_eq!(at_most(3, "a").eval(b"b"), Some(0));
691        assert_eq!(at_most(3, "a").eval(b"aa"), Some(2));
692        assert_eq!(at_most(3, "a").eval(b"aaa"), Some(3));
693        assert_eq!(at_most(3, "a").eval(b"aaaaa"), Some(3));
694
695        // repeats between n & m times
696        assert_eq!(between(2, 4, "a").eval(b""), None);
697        assert_eq!(between(2, 4, "a").eval(b"a"), None);
698        assert_eq!(between(2, 4, "a").eval(b"aa"), Some(2));
699        assert_eq!(between(2, 4, "a").eval(b"aaa"), Some(3));
700        assert_eq!(between(2, 4, "a").eval(b"aaaa"), Some(4));
701        assert_eq!(between(2, 4, "a").eval(b"aaaaa"), Some(4));
702    }
703}