use combine::parser::byte;
use combine::parser::byte::byte;
use combine::*;
use std::fmt;
#[derive(Clone, PartialEq, Eq, Hash)]
pub(crate) enum RegExpr {
Sof,
Eof,
Char {
c: u8,
},
AnyChar,
Between {
from: u8,
to: u8,
},
Range {
cs: Vec<u8>,
},
Not {
not_re: Box<RegExpr>,
},
Either {
l_re: Box<RegExpr>,
r_re: Box<RegExpr>,
},
Optional {
opt_re: Box<RegExpr>,
},
Repeated {
repeat_re: Box<RegExpr>,
at_least: Option<usize>, at_most: Option<usize>, },
Seq {
re_xs: Vec<RegExpr>,
},
}
impl RegExpr {
fn case_insensitive(self) -> Self {
match self {
Self::Char { c } => Self::Range {
cs: case_insensitive(c),
},
Self::Not { not_re } => Self::Not {
not_re: Box::new(not_re.case_insensitive()),
},
Self::Either { l_re, r_re } => Self::Either {
l_re: Box::new(l_re.case_insensitive()),
r_re: Box::new(r_re.case_insensitive()),
},
Self::Optional { opt_re } => Self::Optional {
opt_re: Box::new(opt_re.case_insensitive()),
},
Self::Repeated {
repeat_re,
at_least,
at_most,
} => Self::Repeated {
repeat_re: Box::new(repeat_re.case_insensitive()),
at_least,
at_most,
},
Self::Seq { re_xs } => Self::Seq {
re_xs: re_xs.into_iter().map(|re| re.case_insensitive()).collect(),
},
_ => self,
}
}
}
fn case_insensitive(x: u8) -> Vec<u8> {
let c = u8_to_char(x);
if c.is_ascii_lowercase() {
return vec![x, c.to_ascii_uppercase() as u8];
}
if c.is_ascii_uppercase() {
return vec![x, c.to_ascii_lowercase() as u8];
}
vec![x]
}
pub(crate) fn u8_to_char(c: u8) -> char {
char::from_u32(c as u32).unwrap()
}
impl fmt::Debug for RegExpr {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
Self::Sof => write!(f, "^"),
Self::Eof => write!(f, "$"),
Self::Char { c } => write!(f, "{}", u8_to_char(*c)),
Self::AnyChar => write!(f, "."),
Self::Not { not_re } => {
write!(f, "[^")?;
not_re.fmt(f)?;
write!(f, "]")
}
Self::Between { from, to } => {
write!(f, "[{}->{}]", u8_to_char(*from), u8_to_char(*to),)
}
Self::Range { cs } => write!(
f,
"[{}]",
cs.iter().map(|c| u8_to_char(*c)).collect::<String>(),
),
Self::Either { l_re, r_re } => {
write!(f, "(")?;
l_re.fmt(f)?;
write!(f, "|")?;
r_re.fmt(f)?;
write!(f, ")")
}
Self::Repeated {
repeat_re,
at_least,
at_most,
} => {
let stringify_opt_n = |opt_n: &Option<usize>| -> String {
opt_n.map_or("*".to_string(), |n| format!("{n:?}"))
};
repeat_re.fmt(f)?;
write!(
f,
"{{{},{}}}",
stringify_opt_n(at_least),
stringify_opt_n(at_most)
)
}
Self::Optional { opt_re } => {
opt_re.fmt(f)?;
write!(f, "?")
}
Self::Seq { re_xs } => {
write!(f, "<")?;
for re_x in re_xs {
re_x.fmt(f)?;
}
write!(f, ">")?;
Ok(())
}
}
}
}
pub(crate) fn parse(pattern: &str) -> Result<RegExpr, Box<dyn std::error::Error>> {
let (parsed, unparsed) = (
between(
byte(b'/'),
byte(b'/'),
(optional(byte(b'^')), regex(), optional(byte(b'$'))),
)
.map(|(sof, re, eof)| {
if sof.is_none() && eof.is_none() {
return re;
}
let mut re_xs = vec![];
if sof.is_some() {
re_xs.push(RegExpr::Sof);
}
re_xs.push(re);
if eof.is_some() {
re_xs.push(RegExpr::Eof);
}
RegExpr::Seq { re_xs }
}),
optional(byte(b'i')),
)
.map(|(re, case_insensitive)| {
if case_insensitive.is_some() {
re.case_insensitive()
} else {
re
}
})
.parse(pattern.as_bytes())?;
if !unparsed.is_empty() {
return Err(format!(
"failed to parse regular expression, unexpected token at start of: {}",
std::str::from_utf8(unparsed).unwrap()
)
.into());
}
Ok(parsed)
}
parser! {
fn regex[Input]()(Input) -> RegExpr
where [Input: Stream<Token = u8>]
{
regex_()
}
}
fn regex_<Input>() -> impl Parser<Input, Output = RegExpr>
where
Input: Stream<Token = u8>,
Input::Error: ParseError<Input::Token, Input::Range, Input::Position>,
{
choice((
attempt(
(term(), byte(b'|'), regex()).map(|(l_re, _, r_re)| RegExpr::Either {
l_re: Box::new(l_re),
r_re: Box::new(r_re),
}),
),
term(),
))
}
fn term<Input>() -> impl Parser<Input, Output = RegExpr>
where
Input: Stream<Token = u8>,
Input::Error: ParseError<Input::Token, Input::Range, Input::Position>,
{
many(factor()).map(|re_xs: Vec<RegExpr>| {
if re_xs.len() == 1 {
re_xs[0].clone()
} else {
RegExpr::Seq { re_xs }
}
})
}
fn factor<Input>() -> impl Parser<Input, Output = RegExpr>
where
Input: Stream<Token = u8>,
Input::Error: ParseError<Input::Token, Input::Range, Input::Position>,
{
choice((
attempt((atom(), byte(b'?'))).map(|(re, _)| RegExpr::Optional {
opt_re: Box::new(re),
}),
attempt(repeated()),
atom(),
))
}
const NON_ESCAPABLE_SYMBOLS: [u8; 14] = [
b'&', b';', b':', b',', b'`', b'~', b'-', b'_', b'!', b'@', b'#', b'%', b'\'', b'\"',
];
fn atom<Input>() -> impl Parser<Input, Output = RegExpr>
where
Input: Stream<Token = u8>,
Input::Error: ParseError<Input::Token, Input::Range, Input::Position>,
{
choice((
byte(b'.').map(|_| RegExpr::AnyChar),
attempt(byte(b'\\').with(parser::token::any())).map(|c| RegExpr::Char { c }),
choice((
byte::alpha_num(),
parser::token::one_of(NON_ESCAPABLE_SYMBOLS),
))
.map(|c| RegExpr::Char { c }),
between(byte(b'['), byte(b']'), range()),
between(byte(b'('), byte(b')'), regex()),
))
}
parser! {
fn range[Input]()(Input) -> RegExpr
where [Input: Stream<Token = u8>]
{
range_()
}
}
fn range_<Input>() -> impl Parser<Input, Output = RegExpr>
where
Input: Stream<Token = u8>,
Input::Error: ParseError<Input::Token, Input::Range, Input::Position>,
{
choice((
byte(b'^').with(range()).map(|re| RegExpr::Not {
not_re: Box::new(re),
}),
attempt(
(byte::alpha_num(), byte(b'-'), byte::alpha_num())
.map(|(from, _, to)| RegExpr::Between { from, to }),
),
many1(byte::alpha_num()).map(|cs| RegExpr::Range { cs }),
))
}
fn repeated<Input>() -> impl Parser<Input, Output = RegExpr>
where
Input: Stream<Token = u8>,
Input::Error: ParseError<Input::Token, Input::Range, Input::Position>,
{
choice((
attempt((atom(), choice((byte(b'*'), byte(b'+'))))).map(|(re, c)| RegExpr::Repeated {
repeat_re: Box::new(re),
at_least: if c == b'*' { None } else { Some(1) },
at_most: None,
}),
attempt((
atom(),
between(byte(b'{'), byte(b'}'), many::<Vec<u8>, _, _>(byte::digit())),
))
.map(|(re, repeat_digits)| {
let repeat = parse_digits(&repeat_digits);
RegExpr::Repeated {
repeat_re: Box::new(re),
at_least: Some(repeat),
at_most: Some(repeat),
}
}),
(
atom(),
between(
byte(b'{'),
byte(b'}'),
(
many::<Vec<u8>, _, _>(byte::digit()),
byte(b','),
many::<Vec<u8>, _, _>(byte::digit()),
),
),
)
.map(
|(re, (at_least_digits, _, at_most_digits))| RegExpr::Repeated {
repeat_re: Box::new(re),
at_least: if at_least_digits.is_empty() {
None
} else {
Some(parse_digits(&at_least_digits))
},
at_most: if at_most_digits.is_empty() {
None
} else {
Some(parse_digits(&at_most_digits))
},
},
),
))
}
fn parse_digits(digits: &[u8]) -> usize {
std::str::from_utf8(digits).unwrap().parse().unwrap()
}
#[cfg(test)]
mod tests {
use crate::parser::{parse, RegExpr};
use test_case::test_case;
#[test_case("/h/", RegExpr::Char { c: b'h' }; "char")]
#[test_case("/&/", RegExpr::Char { c: b'&' }; "not necessary to escape ampersand")]
#[test_case("/;/", RegExpr::Char { c: b';' }; "not necessary to escape semicolon")]
#[test_case("/:/", RegExpr::Char { c: b':' }; "not necessary to escape colon")]
#[test_case("/,/", RegExpr::Char { c: b',' }; "not necessary to escape comma")]
#[test_case("/`/", RegExpr::Char { c: b'`' }; "not necessary to escape backtick")]
#[test_case("/~/", RegExpr::Char { c: b'~' }; "not necessary to escape tilde")]
#[test_case("/-/", RegExpr::Char { c: b'-' }; "not necessary to escape minus")]
#[test_case("/_/", RegExpr::Char { c: b'_' }; "not necessary to escape underscore")]
#[test_case("/%/", RegExpr::Char { c: b'%' }; "not necessary to escape percentage")]
#[test_case("/#/", RegExpr::Char { c: b'#' }; "not necessary to escape hashtag")]
#[test_case("/@/", RegExpr::Char { c: b'@' }; "not necessary to escape at")]
#[test_case("/!/", RegExpr::Char { c: b'!' }; "not necessary to escape exclamation")]
#[test_case("/'/", RegExpr::Char { c: b'\'' }; "not necessary to escape single quote")]
#[test_case("/\"/", RegExpr::Char { c: b'\"' }; "not necessary to escape double quote")]
#[test_case("/\\h/", RegExpr::Char { c: b'h' }; "anything can be escaped")]
#[test_case("/./", RegExpr::AnyChar; "any")]
#[test_case("/abc/",
RegExpr::Seq {re_xs: vec![
RegExpr::Char { c: b'a' },
RegExpr::Char { c: b'b' },
RegExpr::Char { c: b'c' },
]};
"abc")]
#[test_case("/^abc/",
RegExpr::Seq {re_xs: vec![
RegExpr::Sof,
RegExpr::Seq {re_xs: vec![
RegExpr::Char { c: b'a' },
RegExpr::Char { c: b'b' },
RegExpr::Char { c: b'c' },
]},
]};
"<sof>abc")]
#[test_case("/abc$/",
RegExpr::Seq {re_xs: vec![
RegExpr::Seq {re_xs: vec![
RegExpr::Char { c: b'a' },
RegExpr::Char { c: b'b' },
RegExpr::Char { c: b'c' },
]},
RegExpr::Eof,
]};
"abc<eof>")]
#[test_case("/^abc$/",
RegExpr::Seq {re_xs: vec![
RegExpr::Sof,
RegExpr::Seq {re_xs: vec![
RegExpr::Char { c: b'a' },
RegExpr::Char { c: b'b' },
RegExpr::Char { c: b'c' },
]},
RegExpr::Eof,
]};
"<sof>abc<eof>")]
#[test_case("/^ab?c$/",
RegExpr::Seq {re_xs: vec![
RegExpr::Sof,
RegExpr::Seq {re_xs: vec![
RegExpr::Char { c: b'a' },
RegExpr::Optional { opt_re: Box::new(RegExpr::Char { c: b'b' }) },
RegExpr::Char { c: b'c' },
]},
RegExpr::Eof,
]};
"<sof>ab<question>c<eof>")]
#[test_case("/^ab*c$/",
RegExpr::Seq {re_xs: vec![
RegExpr::Sof,
RegExpr::Seq {re_xs: vec![
RegExpr::Char { c: b'a' },
RegExpr::Repeated {
repeat_re: Box::new(RegExpr::Char { c: b'b' }),
at_least: None,
at_most: None,
},
RegExpr::Char { c: b'c' },
]},
RegExpr::Eof,
]};
"<sof>ab<star>c<eof>")]
#[test_case("/^ab+c$/",
RegExpr::Seq {re_xs: vec![
RegExpr::Sof,
RegExpr::Seq {re_xs: vec![
RegExpr::Char { c: b'a' },
RegExpr::Repeated {
repeat_re: Box::new(RegExpr::Char { c: b'b' }),
at_least: Some(1),
at_most: None,
},
RegExpr::Char { c: b'c' },
]},
RegExpr::Eof,
]};
"<sof>ab<plus>c<eof>")]
#[test_case("/^ab{2}c$/",
RegExpr::Seq {re_xs: vec![
RegExpr::Sof,
RegExpr::Seq {re_xs: vec![
RegExpr::Char { c: b'a' },
RegExpr::Repeated {
repeat_re: Box::new(RegExpr::Char { c: b'b' }),
at_least: Some(2),
at_most: Some(2),
},
RegExpr::Char { c: b'c' },
]},
RegExpr::Eof,
]};
"<sof>ab<twice>c<eof>")]
#[test_case("/^ab{3,}c$/",
RegExpr::Seq {re_xs: vec![
RegExpr::Sof,
RegExpr::Seq {re_xs: vec![
RegExpr::Char { c: b'a' },
RegExpr::Repeated {
repeat_re: Box::new(RegExpr::Char { c: b'b' }),
at_least: Some(3),
at_most: None,
},
RegExpr::Char { c: b'c' },
]},
RegExpr::Eof,
]};
"<sof>ab<atleast 3>c<eof>")]
#[test_case("/^ab{2,4}c$/",
RegExpr::Seq {re_xs: vec![
RegExpr::Sof,
RegExpr::Seq {re_xs: vec![
RegExpr::Char { c: b'a' },
RegExpr::Repeated {
repeat_re: Box::new(RegExpr::Char { c: b'b' }),
at_least: Some(2),
at_most: Some(4),
},
RegExpr::Char { c: b'c' },
]},
RegExpr::Eof,
]};
"<sof>ab<between 2 and 4>c<eof>")]
#[test_case("/^.$/",
RegExpr::Seq {re_xs: vec![
RegExpr::Sof,
RegExpr::AnyChar,
RegExpr::Eof,
]};
"<sof><any><eof>")]
#[test_case("/^[abc]$/",
RegExpr::Seq {re_xs: vec![
RegExpr::Sof,
RegExpr::Range { cs: vec![b'a', b'b', b'c'] },
RegExpr::Eof,
]};
"<sof><a or b or c><eof>")]
#[test_case("/^[a-d]$/",
RegExpr::Seq {re_xs: vec![
RegExpr::Sof,
RegExpr::Between { from: b'a', to: b'd' },
RegExpr::Eof,
]};
"<sof><between a and d><eof>")]
#[test_case("/^[^abc]$/",
RegExpr::Seq {re_xs: vec![
RegExpr::Sof,
RegExpr::Not { not_re: Box::new(RegExpr::Range { cs: vec![b'a', b'b', b'c'] })},
RegExpr::Eof,
]};
"<sof><not <a or b or c>><eof>")]
#[test_case("/^[^a-d]$/",
RegExpr::Seq {re_xs: vec![
RegExpr::Sof,
RegExpr::Not { not_re: Box::new(RegExpr::Between { from: b'a', to: b'd' }) },
RegExpr::Eof,
]};
"<sof><not <between a and d>><eof>")]
#[test_case("/^abc$/i",
RegExpr::Seq {re_xs: vec![
RegExpr::Sof,
RegExpr::Seq {re_xs: vec![
RegExpr::Range { cs: vec![b'a', b'A'] },
RegExpr::Range { cs: vec![b'b', b'B'] },
RegExpr::Range { cs: vec![b'c', b'C'] },
]},
RegExpr::Eof,
]};
"<sof>abc<eof> (case insensitive)")]
#[test_case("/^/",
RegExpr::Seq {re_xs: vec![
RegExpr::Sof,
RegExpr::Seq { re_xs: vec![] }
]};
"sof")]
#[test_case("/$/",
RegExpr::Seq {re_xs: vec![
RegExpr::Seq { re_xs: vec![] },
RegExpr::Eof
]};
"eof")]
#[test_case("/a*/",
RegExpr::Repeated {
repeat_re: Box::new(RegExpr::Char { c: b'a' }),
at_least: None,
at_most: None,
};
"repeat unbounded (w/ *)")]
#[test_case("/a+/",
RegExpr::Repeated {
repeat_re: Box::new(RegExpr::Char { c: b'a' }),
at_least: Some(1),
at_most: None,
};
"repeat bounded at least (w/ +)")]
#[test_case("/a{104,}/",
RegExpr::Repeated {
repeat_re: Box::new(RegExpr::Char { c: b'a' }),
at_least: Some(104),
at_most: None,
};
"repeat bounded at least (w/ {x,}")]
#[test_case("/a{,15}/",
RegExpr::Repeated {
repeat_re: Box::new(RegExpr::Char { c: b'a' }),
at_least: None,
at_most: Some(15),
};
"repeat bounded at most (w/ {,x}")]
#[test_case("/a{12,15}/",
RegExpr::Repeated {
repeat_re: Box::new(RegExpr::Char { c: b'a' }),
at_least: Some(12),
at_most: Some(15),
};
"repeat bounded at least and at most (w/ {x,y}")]
#[test_case("/(a|b)*/",
RegExpr::Repeated {
repeat_re: Box::new(RegExpr::Either {
l_re: Box::new(RegExpr::Char { c: b'a' }),
r_re: Box::new(RegExpr::Char { c: b'b' }),
}),
at_least: None,
at_most: None,
};
"repeat complex unbounded")]
#[test_case("/(a|b){3,7}/",
RegExpr::Repeated {
repeat_re: Box::new(RegExpr::Either {
l_re: Box::new(RegExpr::Char { c: b'a' }),
r_re: Box::new(RegExpr::Char { c: b'b' }),
}),
at_least: Some(3),
at_most: Some(7),
};
"repeat complex bounded")]
#[test_case("/^ab|cd/",
RegExpr::Seq { re_xs: vec![
RegExpr::Sof,
RegExpr::Either {
l_re: Box::new(RegExpr::Seq { re_xs: vec![
RegExpr::Char { c: b'a' },
RegExpr::Char { c: b'b' },
] }),
r_re: Box::new(RegExpr::Seq { re_xs: vec![
RegExpr::Char { c: b'c' },
RegExpr::Char { c: b'd' },
]}),
},
]};
"Sof encapsulates full RHS")]
#[test_case("/ab|cd$/",
RegExpr::Seq {re_xs: vec![
RegExpr::Either {
l_re: Box::new(RegExpr::Seq {re_xs: vec![
RegExpr::Char { c: b'a' },
RegExpr::Char { c: b'b' },
]}),
r_re: Box::new(RegExpr::Seq {re_xs: vec![
RegExpr::Char { c: b'c' },
RegExpr::Char { c: b'd' },
]}),
},
RegExpr::Eof,
]};
"Eof encapsulates full RHS" )]
#[test_case("/^ab|cd$/",
RegExpr::Seq {re_xs: vec![
RegExpr::Sof,
RegExpr::Either {
l_re: Box::new(RegExpr::Seq {re_xs: vec![
RegExpr::Char { c: b'a' },
RegExpr::Char { c: b'b' },
]}),
r_re: Box::new(RegExpr::Seq {re_xs: vec![
RegExpr::Char { c: b'c' },
RegExpr::Char { c: b'd' },
]}),
},
RegExpr::Eof,
]};
"Sof + Eof both encapsulate full center")]
#[test_case("/\\^/",
RegExpr::Char { c: b'^' };
"escaping sof symbol")]
#[test_case("/\\./",
RegExpr::Char { c: b'.' };
"escaping period symbol")]
#[test_case("/\\*/",
RegExpr::Char { c: b'*' };
"escaping star symbol")]
#[test_case("/^ca\\^b$/",
RegExpr::Seq {re_xs: vec![
RegExpr::Sof,
RegExpr::Seq {re_xs: vec![
RegExpr::Char { c: b'c' },
RegExpr::Char { c: b'a' },
RegExpr::Char { c: b'^' },
RegExpr::Char { c: b'b' },
]},
RegExpr::Eof,
]};
"escaping, more realistic")]
#[test_case("/8/",
RegExpr::Char { c: b'8' };
"able to match numbers")]
#[test_case("/[7-9]/",
RegExpr::Between { from: b'7', to: b'9' };
"able to match a number range")]
#[test_case("/[79]/",
RegExpr::Range { cs: vec![b'7', b'9'] };
"able to match a number range (part 2)")]
fn test_parser(pattern: &str, exp: RegExpr) {
match parse(pattern) {
Ok(got) => assert_eq!(exp, got),
Err(e) => panic!("got err: {}", e),
}
}
}