use crate::dfa::DFA;
use crate::mir::{Error as MirError, Mir, MirBuilder, SymbolOp};
use crate::nfa::NFA;
use crate::table::StateTransTable;
use regex_syntax::hir::Hir;
use regex_syntax::{parse, Error as RegexError, ParserBuilder};
use std::fmt;
use std::hash::Hash;
pub struct RegexBuilder<T> {
re_tags: Vec<(String, T)>,
enable_par: Option<bool>,
}
impl<T> RegexBuilder<T> {
pub fn new() -> Self {
Self {
re_tags: Vec::new(),
enable_par: None,
}
}
pub fn add(mut self, re: &str, tag: T) -> Self {
self.re_tags.push((re.into(), tag));
self
}
pub fn enable_par(mut self, enable_par: Option<bool>) -> Self {
self.enable_par = enable_par;
self
}
}
impl<T> RegexBuilder<T>
where
T: Clone + Hash + Eq + Ord,
{
pub fn build<S>(self) -> Result<RegexMatcher<S, T>, Error<T>>
where
S: Hash + Eq + Clone + Ord + SymbolOp + Sync + Send,
Mir<S, T>: MirBuilder,
{
self.build_impl(parse)
}
pub fn build_bytes<S>(self) -> Result<RegexMatcher<S, T>, Error<T>>
where
S: Hash + Eq + Clone + Ord + SymbolOp + Sync + Send,
Mir<S, T>: MirBuilder,
{
self.build_impl(|re| ParserBuilder::new().utf8(false).build().parse(re))
}
fn build_impl<R, S>(self, re_parse: R) -> Result<RegexMatcher<S, T>, Error<T>>
where
R: Fn(&str) -> Result<Hir, RegexError>,
S: Hash + Eq + Clone + Ord + SymbolOp + Sync + Send,
Mir<S, T>: MirBuilder,
{
if self.re_tags.is_empty() {
Err(Error::EmptyBuilder)
} else {
Mir::Alter(
self
.re_tags
.into_iter()
.map(|(re, tag)| {
re_parse(&re)
.map_err(|e| Error::Regex(Box::new(e), tag.clone()))
.and_then(|hir| Mir::new(hir).map_err(Error::Mir))
.map(|mir| (mir, Some(tag)))
})
.collect::<Result<_, _>>()?,
)
.optimize()
.map(|mir| {
RegexMatcher::new(StateTransTable::new(DFA::new(
NFA::new(mir),
self.enable_par,
)))
})
.map_err(Error::Mir)
}
}
}
impl<T> Default for RegexBuilder<T> {
fn default() -> Self {
Self::new()
}
}
#[derive(Debug)]
pub enum Error<T> {
EmptyBuilder,
Regex(Box<RegexError>, T),
Mir(MirError),
}
impl<T> fmt::Display for Error<T> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
Self::EmptyBuilder => write!(f, "no regular expressions in the builder"),
Self::Regex(e, _) => write!(f, "{e}"),
Self::Mir(e) => write!(f, "{e}"),
}
}
}
#[derive(Debug)]
pub struct RegexMatcher<S, T> {
table: StateTransTable<S, T>,
state: usize,
}
impl<S, T> RegexMatcher<S, T> {
fn new(table: StateTransTable<S, T>) -> Self {
Self {
state: table.init_id(),
table,
}
}
pub fn state(&self) -> usize {
self.state
}
pub fn is_match(&self, seq: &[S]) -> Option<&T>
where
S: Ord,
{
let mut id = self.table.init_id();
for s in seq {
if let Some(next) = self.table.next_state(id, s) {
id = next;
} else {
return None;
}
}
self.table.is_final(id)
}
pub fn is_accept(&mut self, s: &S) -> bool
where
S: Ord,
{
if let Some(next) = self.table.next_state(self.state, s) {
self.state = next;
true
} else {
false
}
}
pub fn is_final(&self) -> Option<&T> {
self.table.is_final(self.state)
}
pub fn is_state_final(&self, id: usize) -> Option<&T> {
self.table.is_final(id)
}
pub fn reset(&mut self) {
self.state = self.table.init_id();
}
}
impl<S, T> From<RegexMatcher<S, T>> for StateTransTable<S, T> {
fn from(matcher: RegexMatcher<S, T>) -> Self {
matcher.table
}
}
pub type CharsMatcher<T> = RegexMatcher<char, T>;
impl<T> CharsMatcher<T> {
pub fn is_str_match(&self, s: &str) -> Option<&T> {
let mut id = self.table.init_id();
for c in s.chars() {
if let Some(next) = self.table.next_state(id, &c) {
id = next;
} else {
return None;
}
}
self.table.is_final(id)
}
}
pub type BytesMatcher<T> = RegexMatcher<u8, T>;
impl<T> BytesMatcher<T> {
pub fn is_str_match(&self, s: &str) -> Option<&T> {
let mut id = self.table.init_id();
for c in s.bytes() {
if let Some(next) = self.table.next_state(id, &c) {
id = next;
} else {
return None;
}
}
self.table.is_final(id)
}
}
#[cfg(test)]
mod test {
use super::*;
use Token::*;
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
enum Token {
Keyword,
Identifier,
Number,
Str,
Operator,
Skip,
Other,
}
#[test]
fn match_string() {
let matcher: CharsMatcher<_> = RegexBuilder::new()
.add("if|else|while", Keyword)
.add("[_a-zA-Z][_a-zA-Z0-9]*", Identifier)
.add("[0-9]|[1-9][0-9]+", Number)
.build()
.unwrap();
assert_eq!(matcher.is_str_match("if"), Some(&Keyword));
assert_eq!(matcher.is_str_match("else"), Some(&Keyword));
assert_eq!(matcher.is_str_match("while"), Some(&Keyword));
assert_eq!(matcher.is_str_match("ifi"), Some(&Identifier));
assert_eq!(matcher.is_str_match("else1"), Some(&Identifier));
assert_eq!(matcher.is_str_match("_while"), Some(&Identifier));
assert_eq!(matcher.is_str_match("a_8"), Some(&Identifier));
assert_eq!(matcher.is_str_match("_"), Some(&Identifier));
assert_eq!(matcher.is_str_match("A_good_id"), Some(&Identifier));
assert_eq!(matcher.is_str_match("A_b@d_id"), None);
assert_eq!(matcher.is_str_match("0"), Some(&Number));
assert_eq!(matcher.is_str_match("5"), Some(&Number));
assert_eq!(matcher.is_str_match("12450"), Some(&Number));
assert_eq!(matcher.is_str_match("10"), Some(&Number));
assert_eq!(matcher.is_str_match("01"), None);
assert_eq!(matcher.is_str_match(""), None);
assert_eq!(matcher.is_str_match("?"), None);
}
#[test]
fn match_bytes() {
let matcher: BytesMatcher<_> = RegexBuilder::new()
.add("hello|hi", 0)
.add("goodbye|bye", 1)
.build_bytes()
.unwrap();
assert_eq!(matcher.is_str_match("hello"), Some(&0));
assert_eq!(matcher.is_match(b"hello"), Some(&0));
assert_eq!(matcher.is_match(b"hi"), Some(&0));
assert_eq!(matcher.is_match(b"goodbye"), Some(&1));
assert_eq!(matcher.is_match(&[0x62, 0x79, 0x65]), Some(&1));
}
#[test]
fn match_stream() {
use std::io::{Cursor, Read};
struct Lexer<R> {
reader: R,
matcher: CharsMatcher<Token>,
last_char: Option<char>,
}
impl<R> Lexer<R> {
fn new(reader: R) -> Self {
Self {
reader,
matcher: RegexBuilder::new()
.add("if|else|while", Keyword)
.add("[_a-zA-Z][_a-zA-Z0-9]*", Identifier)
.add("[0-9]|[1-9][0-9]+", Number)
.add("\"[^\"\r\n]*\"", Str)
.add(r"==|>|-=|\+=", Operator)
.add(r"\s+", Skip)
.add(".", Other)
.build()
.unwrap(),
last_char: None,
}
}
fn unread(&mut self, c: char) {
self.last_char = Some(c);
}
}
impl<R> Lexer<R>
where
R: Read,
{
fn read(&mut self) -> Option<char> {
let mut buf = [0];
match self.last_char.take() {
None => match self.reader.read(&mut buf) {
Ok(1) => Some(buf[0] as char),
_ => None,
},
c => c,
}
}
fn next_token_impl(&mut self) -> Option<(Token, String)> {
let mut last_state;
let mut buf = String::new();
self.matcher.reset();
loop {
let c = self.read()?;
last_state = self.matcher.state();
if !self.matcher.is_accept(&c) {
self.unread(c);
break;
}
buf.push(c);
}
self.matcher.is_state_final(last_state).map(|t| (*t, buf))
}
fn next_token(&mut self) -> Option<(Token, String)> {
loop {
let ts = self.next_token_impl();
if !matches!(ts, Some((Skip, _))) {
return ts;
}
}
}
}
let mut lexer = Lexer::new(Cursor::new(
r#"
while (test(b) =="hello!") {
if (b> 5){
b-=1;
} else {
b += 2;
}
}
"#,
));
assert_eq!(lexer.next_token(), Some((Keyword, "while".into())));
assert_eq!(lexer.next_token(), Some((Other, "(".into())));
assert_eq!(lexer.next_token(), Some((Identifier, "test".into())));
assert_eq!(lexer.next_token(), Some((Other, "(".into())));
assert_eq!(lexer.next_token(), Some((Identifier, "b".into())));
assert_eq!(lexer.next_token(), Some((Other, ")".into())));
assert_eq!(lexer.next_token(), Some((Operator, "==".into())));
assert_eq!(lexer.next_token(), Some((Str, "\"hello!\"".into())));
assert_eq!(lexer.next_token(), Some((Other, ")".into())));
assert_eq!(lexer.next_token(), Some((Other, "{".into())));
assert_eq!(lexer.next_token(), Some((Keyword, "if".into())));
assert_eq!(lexer.next_token(), Some((Other, "(".into())));
assert_eq!(lexer.next_token(), Some((Identifier, "b".into())));
assert_eq!(lexer.next_token(), Some((Operator, ">".into())));
assert_eq!(lexer.next_token(), Some((Number, "5".into())));
assert_eq!(lexer.next_token(), Some((Other, ")".into())));
assert_eq!(lexer.next_token(), Some((Other, "{".into())));
assert_eq!(lexer.next_token(), Some((Identifier, "b".into())));
assert_eq!(lexer.next_token(), Some((Operator, "-=".into())));
assert_eq!(lexer.next_token(), Some((Number, "1".into())));
assert_eq!(lexer.next_token(), Some((Other, ";".into())));
assert_eq!(lexer.next_token(), Some((Other, "}".into())));
assert_eq!(lexer.next_token(), Some((Keyword, "else".into())));
assert_eq!(lexer.next_token(), Some((Other, "{".into())));
assert_eq!(lexer.next_token(), Some((Identifier, "b".into())));
assert_eq!(lexer.next_token(), Some((Operator, "+=".into())));
assert_eq!(lexer.next_token(), Some((Number, "2".into())));
assert_eq!(lexer.next_token(), Some((Other, ";".into())));
assert_eq!(lexer.next_token(), Some((Other, "}".into())));
assert_eq!(lexer.next_token(), Some((Other, "}".into())));
assert_eq!(lexer.next_token(), None);
}
#[test]
fn match_word() {
let matcher: CharsMatcher<_> = RegexBuilder::new().add(r"\w+", 0).build().unwrap();
assert_eq!(matcher.is_str_match("if"), Some(&0));
assert_eq!(matcher.is_str_match("hello"), Some(&0));
assert_eq!(matcher.is_str_match(".hello"), None);
assert_eq!(matcher.is_str_match("??"), None);
}
#[test]
fn match_xeno_tokens() {
let res = [
r"\s+",
r"(0b[01]+|0o[0-7]+|0x[0-9a-fA-F]+|[0-9]+)([iIuU](8|16|32|64))?",
r"[0-9]+(\.([0-9]+([eE][+-]?[0-9]+)?([fF](32|64))?)?|([eE][+-]?[0-9]+)([fF](32|64))?|([fF](32|64)))",
r#"'([^'\\\n\r\t]|\\'|\\"|\\x[0-7][0-9a-fA-F]|\\n|\\r|\\t|\\\\|\\0|\\u\{[0-9a-fA-F]{1,6}\})'"#,
r#"b'([\x20-\x26\x28-\x5b\x5d-\x7e]|\\x[0-9a-fA-F]{2}|\\n|\\r|\\t|\\\\|\\0|\\'|\\")'"#,
r#""([^'\\\n\r\t]|\\'|\\"|\\x[0-7][0-9a-fA-F]|\\n|\\r|\\t|\\\\|\\0|\\u\{[0-9a-fA-F]{1,6}\})*""#,
r####"r"[^"]*"|r#"([^"]|"[^#])*"#|r##"([^"]|"[^#]|"#[^#])*"##|r###"([^"]|"[^#]|"#[^#]|"##[^#])*"###"####,
r#"b"([\x20-\x26\x28-\x5b\x5d-\x7e]|\\x[0-9a-fA-F]{2}|\\n|\\r|\\t|\\\\|\\0|\\'|\\")*""#,
r"\+|-|\*|/|%|&|\||!|\^|<<|>>|&&|\|\||==|!=|<|<=|>|>=|=|\+=|-=|\*=|/=|%=|&=|\|=|\^=|<<=|>>=|\(|\)|\[|\]|\{|\}|\.|\.\.|\.\.\.|->|,|:|@|_|\?",
r"[~!@#$%^&*()_\-+={}\[\]|\\:;<,>.?/]+",
r#"[^\s~!@#$%^&*()_\-+={}\[\]|\\:;<,>.?/0-9][^\s~!@#$%^&*()\-+={}\[\]|\\:;<,>.?/]*"#,
];
let matcher: CharsMatcher<_> = res
.iter()
.enumerate()
.fold(RegexBuilder::new(), |b, (i, re)| b.add(re, i))
.build()
.unwrap();
assert_eq!(matcher.is_str_match("123"), Some(&1));
}
}