pub trait Pattern: Clone + Copy {
fn eval(&self, input: &[u8]) -> Option<usize>;
fn then<P>(self, other: P) -> Sequence<Self, P>
where
Self: Sized,
P: Pattern,
{
Sequence {
pat1: self,
pat2: other,
}
}
fn or<P>(self, other: P) -> Choice<Self, P>
where
Self: Sized,
P: Pattern,
{
Choice {
pat1: self,
pat2: other,
}
}
}
impl<P> Pattern for &P
where
P: Pattern,
{
fn eval(&self, input: &[u8]) -> Option<usize> {
(*self).eval(input)
}
}
#[derive(Clone, Copy, Debug)]
pub struct Choice<C1, C2> {
pat1: C1,
pat2: C2,
}
impl<P1, P2> Pattern for Choice<P1, P2>
where
P1: Pattern,
P2: Pattern,
{
fn eval(&self, input: &[u8]) -> Option<usize> {
match self.pat1.eval(input) {
Some(res) => Some(res),
None => self.pat2.eval(input),
}
}
}
#[derive(Clone, Copy, Debug)]
pub struct Sequence<P1, P2> {
pat1: P1,
pat2: P2,
}
impl<P1, P2> Pattern for Sequence<P1, P2>
where
P1: Pattern,
P2: Pattern,
{
fn eval(&self, input: &[u8]) -> Option<usize> {
if let Some(a) = self.pat1.eval(input) {
if let Some(b) = self.pat2.eval(&input[a..]) {
return Some(a + b);
}
}
None
}
}
pub fn byte() -> One {
One
}
#[derive(Debug, Clone, Copy)]
pub struct One;
impl Pattern for One {
fn eval(&self, input: &[u8]) -> Option<usize> {
if input.is_empty() {
return None;
}
Some(1)
}
}
pub fn codepoint() -> Codepoint {
Codepoint
}
#[derive(Debug, Clone, Copy)]
pub struct Codepoint;
impl Pattern for Codepoint {
fn eval(&self, input: &[u8]) -> Option<usize> {
let lookahead = std::cmp::min(input.len(), 4);
let chunk = String::from_utf8_lossy(&input[..lookahead]);
let size = match chunk.chars().next() {
Some(c) if c == char::REPLACEMENT_CHARACTER => return None,
None => return None,
Some(c) => c.len_utf8(),
};
Some(size)
}
}
pub fn prefix(slice: &[u8]) -> Prefix<'_> {
Prefix(slice)
}
pub fn utf8(s: &str) -> Prefix<'_> {
Prefix(s.as_bytes())
}
impl Pattern for &str {
fn eval(&self, input: &[u8]) -> Option<usize> {
utf8(self).eval(input)
}
}
impl Pattern for &[u8] {
fn eval(&self, input: &[u8]) -> Option<usize> {
Prefix(self).eval(input)
}
}
impl Pattern for u8 {
fn eval(&self, input: &[u8]) -> Option<usize> {
Prefix(&[*self]).eval(input)
}
}
#[derive(Debug, Clone, Copy)]
pub struct Prefix<'p>(&'p [u8]);
impl Pattern for Prefix<'_> {
fn eval(&self, input: &[u8]) -> Option<usize> {
if !input.starts_with(self.0) {
return None;
}
Some(self.0.len())
}
}
pub fn oneof(bytes: &[u8]) -> ByteLookupTable {
let mut set: [bool; 256] = [false; 256];
let mut i = 0;
while i < bytes.len() {
set[bytes[i] as usize] = true;
i += 1;
}
ByteLookupTable(set)
}
pub fn noneof(bytes: &[u8]) -> ByteLookupTable {
let mut set = oneof(bytes).0;
let mut i = 0;
while i < set.len() {
set[i] = !set[i];
i += 1;
}
ByteLookupTable(set)
}
pub fn digit() -> ByteLookupTable {
oneof(b"0123456789")
}
pub fn alpha() -> ByteLookupTable {
oneof(b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
}
pub fn hex() -> Choice<ByteLookupTable, ByteLookupTable> {
oneof(b"abcdefABCDEF").or(digit())
}
#[derive(Debug, Clone, Copy)]
pub struct ByteLookupTable([bool; 256]);
impl Pattern for ByteLookupTable {
fn eval<'i>(&self, input: &[u8]) -> Option<usize> {
let first = *input.first()?;
if self.0[first as usize] {
return Some(1);
}
None
}
}
pub fn range(lo: u8, hi: u8) -> ElementRange {
ElementRange(lo, hi)
}
#[derive(Debug, Clone, Copy)]
pub struct ElementRange(u8, u8);
impl Pattern for ElementRange {
fn eval(&self, input: &[u8]) -> Option<usize> {
let first = input.first()?;
if first >= &self.0 && first <= &self.1 {
return Some(1);
}
None
}
}
pub fn any<P>(pattern: P) -> Repetition<P>
where
P: Pattern,
{
Repetition {
lo: 0,
hi: None,
pattern,
}
}
pub fn at_least<P>(n: usize, pattern: P) -> Repetition<P>
where
P: Pattern,
{
Repetition {
lo: n,
hi: None,
pattern,
}
}
pub fn at_most<P>(n: usize, pattern: P) -> Repetition<P>
where
P: Pattern,
{
Repetition {
lo: 0,
hi: Some(n),
pattern,
}
}
pub fn optional<P>(pattern: P) -> Repetition<P>
where
P: Pattern,
{
Repetition {
lo: 0,
hi: Some(1),
pattern,
}
}
pub fn count<P>(n: usize, pattern: P) -> Repetition<P>
where
P: Pattern,
{
Repetition {
lo: n,
hi: Some(n),
pattern,
}
}
pub fn between<P>(lo: usize, hi: usize, pattern: P) -> Repetition<P>
where
P: Pattern,
{
Repetition {
lo,
hi: Some(hi),
pattern,
}
}
#[derive(Debug, Clone, Copy)]
pub struct Repetition<P> {
pattern: P,
lo: usize,
hi: Option<usize>,
}
impl<P> Pattern for Repetition<P>
where
P: Pattern,
{
fn eval<'i>(&self, input: &[u8]) -> Option<usize> {
let mut match_count = 0;
let mut offset = 0;
if let Some(upper_bound) = self.hi {
assert!(
upper_bound >= self.lo,
"upper bound should be greater than or equal to the lower bound"
);
};
loop {
if let Some(upper_bound) = self.hi {
if match_count == upper_bound {
return Some(offset);
}
};
let Some(value) = self.pattern.eval(&input[offset..]) else {
if match_count < self.lo {
return None;
}
return Some(offset);
};
match_count += 1;
offset += value;
}
}
}
pub fn not<P>(pattern: P) -> Not<P>
where
P: Pattern,
{
Not(pattern)
}
#[derive(Debug, Clone, Copy)]
pub struct Not<P>(P);
impl<P> Pattern for Not<P>
where
P: Pattern,
{
fn eval<'i>(&self, input: &[u8]) -> Option<usize> {
if self.0.eval(input).is_some() {
return None;
}
Some(0)
}
}
pub fn end() -> Eof {
Eof
}
#[derive(Debug, Clone, Copy)]
pub struct Eof;
impl Pattern for Eof {
fn eval(&self, input: &[u8]) -> Option<usize> {
input.is_empty().then_some(0)
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn match_prefix() {
assert_eq!("".eval(b""), Some(0));
assert_eq!("".eval(b"aa"), Some(0));
assert_eq!("aa".eval(b""), None);
assert_eq!("aa".eval(b"a"), None);
assert_eq!("aa".eval(b"b"), None);
assert_eq!("aa".eval(b"aa"), Some(2));
assert_eq!("aa".eval(b"bb"), None);
assert_eq!("aa".eval(b"aaa"), Some(2));
assert_eq!("aa".eval(b"bbb"), None);
}
#[test]
fn match_codepoint() {
assert_eq!(codepoint().eval(b"a"), Some(1));
assert_eq!(codepoint().eval("👻".as_bytes()), Some(4));
assert_eq!(codepoint().eval(b"\xff"), None);
}
#[test]
fn match_range() {
assert_eq!(range(b'5', b'5').eval(b""), None);
assert_eq!(range(b'5', b'5').eval(b"5"), Some(1));
assert_eq!(range(b'5', b'5').eval(b"4"), None);
assert_eq!(range(b'5', b'5').eval(b"6"), None);
assert_eq!(range(b'3', b'5').eval(b""), None);
assert_eq!(range(b'3', b'5').eval(b"3"), Some(1));
assert_eq!(range(b'3', b'5').eval(b"5"), Some(1));
assert_eq!(range(b'3', b'5').eval(b"4"), Some(1));
assert_eq!(range(b'3', b'5').eval(b"2"), None);
assert_eq!(range(b'3', b'5').eval(b"6"), None);
}
#[test]
fn match_oneof_noneof() {
assert_eq!(oneof(b"").eval(b""), None);
assert_eq!(noneof(b"").eval(b""), None);
assert_eq!(oneof(b"").eval(b"a"), None);
assert_eq!(noneof(b"").eval(b"a"), Some(1));
assert_eq!(oneof(b"ab").eval(b""), None);
assert_eq!(noneof(b"ab").eval(b""), None);
assert_eq!(oneof(b"ab").eval(b"a"), Some(1));
assert_eq!(oneof(b"ab").eval(b"b"), Some(1));
assert_eq!(noneof(b"ab").eval(b"a"), None);
assert_eq!(noneof(b"ab").eval(b"b"), None);
assert_eq!(oneof(b"ab").eval(b"c"), None);
assert_eq!(noneof(b"ab").eval(b"c"), Some(1));
}
#[test]
fn match_choice() {
assert_eq!("a".or("aa").eval(b"aa"), Some(1));
assert_eq!("aa".or("a").eval(b"aa"), Some(2));
assert_eq!("a".or("b").eval(b"a"), Some(1));
assert_eq!("b".or("a").eval(b"a"), Some(1));
assert_eq!("b".or("a").eval(b"c"), None);
}
#[test]
fn match_sequence() {
assert_eq!("a".then("b").eval(b"ab"), Some(2));
assert_eq!("a".then("c").eval(b"ab"), None);
assert_eq!("a".then("").eval(b"b"), None);
assert_eq!("a".then("c").eval(b"b"), None);
}
#[test]
fn patterns_are_reusable() {
let pattern = "a".then("b".or("c"));
assert_eq!(pattern.eval(b"ac"), Some(2));
assert_eq!(pattern.eval(b"ab"), Some(2));
}
#[test]
fn repetition_patterns() {
assert_eq!(count(0, "a").eval(b""), Some(0));
assert_eq!(count(0, "a").eval(b"a"), Some(0));
assert_eq!(count(0, "a").eval(b"aa"), Some(0));
assert_eq!(count(3, "a").eval(b""), None);
assert_eq!(count(3, "a").eval(b"aa"), None);
assert_eq!(count(3, "a").eval(b"aaa"), Some(3));
assert_eq!(count(3, "a").eval(b"aaaa"), Some(3));
assert_eq!(any("a").eval(b""), Some(0));
assert_eq!(any("a").eval(b"b"), Some(0));
assert_eq!(any("a").eval(b"aa"), Some(2));
assert_eq!(any("a").eval(b"aab"), Some(2));
assert_eq!(at_least(0, "a").eval(b""), Some(0));
assert_eq!(at_least(0, "a").eval(b"a"), Some(1));
assert_eq!(at_least(0, "a").eval(b"aaaa"), Some(4));
assert_eq!(at_least(3, "a").eval(b""), None);
assert_eq!(at_least(3, "a").eval(b"aa"), None);
assert_eq!(at_least(3, "a").eval(b"aaa"), Some(3));
assert_eq!(at_least(3, "a").eval(b"aaaaa"), Some(5));
assert_eq!(at_most(0, "a").eval(b""), Some(0));
assert_eq!(at_most(0, "a").eval(b"a"), Some(0));
assert_eq!(at_most(0, "a").eval(b"aa"), Some(0));
assert_eq!(at_most(3, "a").eval(b""), Some(0));
assert_eq!(at_most(3, "a").eval(b"b"), Some(0));
assert_eq!(at_most(3, "a").eval(b"aa"), Some(2));
assert_eq!(at_most(3, "a").eval(b"aaa"), Some(3));
assert_eq!(at_most(3, "a").eval(b"aaaaa"), Some(3));
assert_eq!(between(2, 4, "a").eval(b""), None);
assert_eq!(between(2, 4, "a").eval(b"a"), None);
assert_eq!(between(2, 4, "a").eval(b"aa"), Some(2));
assert_eq!(between(2, 4, "a").eval(b"aaa"), Some(3));
assert_eq!(between(2, 4, "a").eval(b"aaaa"), Some(4));
assert_eq!(between(2, 4, "a").eval(b"aaaaa"), Some(4));
}
}