use super::{MatchTreeKind, MatchTreeTemplate};
use std::borrow::Cow;
use regex_syntax::hir::{Capture, Hir};
#[expect(unused_imports, reason = "for doc links")]
use crate::{FromScanf, advanced::Match};
/// A composable description of the text that has to be matched.
///
/// A matcher is turned into a regex by compiling it: every matcher is wrapped
/// in a capture group of its own, and matchers nested inside it end up as
/// capture groups nested inside that group.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub enum Matcher {
/// A regular expression, kept in parsed form.
Regex(RegexMatcher),
/// Several parts that are concatenated, in order, when compiled.
Seq(Vec<MatchPart>),
/// Several matchers that are compiled into a regex alternation.
Alt(Vec<Matcher>),
/// A matcher that is compiled into a greedy "zero or one" repetition.
Optional(Box<Matcher>),
}
/// Payload of [`Matcher::Regex`]: a regular expression in parsed form.
///
/// The field is private, so values are only created inside this module.
#[derive(Debug, Clone)]
pub struct RegexMatcher {
// Parsed regex; its capture groups are renumbered when the matcher is compiled.
hir: Hir,
}
impl Matcher {
pub fn from_regex(regex: impl AsRef<str>) -> Result<Self, String> {
regex_syntax::parse(regex.as_ref())
.map(|hir| Self::Regex(RegexMatcher { hir }))
.map_err(|err| err.to_string())
}
pub fn optional(self) -> Self {
Matcher::Optional(Box::new(self))
}
pub(crate) fn from_raw(hir: Hir) -> Self {
Self::Regex(RegexMatcher { hir })
}
pub(crate) fn compile(self, capture_index: &mut usize) -> (Capture, MatchTreeTemplate) {
let index = *capture_index;
*capture_index += 1;
let (hir, kind) = match self {
Matcher::Regex(RegexMatcher { mut hir }) => {
let start_index = *capture_index;
compile_raw(&mut hir, capture_index);
let end_index = *capture_index;
(hir, MatchTreeKind::Regex(start_index..end_index))
}
Matcher::Seq(matchers) => {
let mut hirs = vec![];
let mut children = vec![];
for matcher in matchers {
match matcher {
MatchPart::Matcher(matcher) => {
let (capture, child_index) = matcher.compile(capture_index);
hirs.push(Hir::capture(capture));
children.push(Some(child_index));
}
MatchPart::Regex(regex_part) => {
hirs.push(regex_part.hir);
children.push(None);
}
MatchPart::Literal(Cow::Owned(s)) => {
let hir = Hir::literal(s.into_bytes().into_boxed_slice());
hirs.push(hir);
children.push(None);
}
MatchPart::Literal(Cow::Borrowed(s)) => {
let hir = Hir::literal(s.as_bytes());
hirs.push(hir);
children.push(None);
}
}
}
(Hir::concat(hirs), MatchTreeKind::Seq(children))
}
Matcher::Alt(matchers) => {
let (hirs, children) = matchers
.into_iter()
.map(|m| m.compile(capture_index))
.map(|(capture, child_index)| (Hir::capture(capture), child_index))
.unzip();
(Hir::alternation(hirs), MatchTreeKind::Alt(children))
}
Matcher::Optional(matcher) => {
let (capture, child_index) = matcher.compile(capture_index);
let hir = Hir::repetition(regex_syntax::hir::Repetition {
min: 0,
max: Some(1),
greedy: true,
sub: Box::new(Hir::capture(capture)),
});
(hir, MatchTreeKind::Optional(Box::new(child_index)))
}
};
let capture = Capture {
index: u32::try_from(index).expect("capture index overflowed u32"),
name: None,
sub: Box::new(hir),
};
(capture, MatchTreeTemplate { index, kind })
}
pub fn debug_to_regex(&self) -> String {
let mut capture_index = 0;
let (capture, _) = self.clone().compile(&mut capture_index);
Hir::capture(capture).to_string()
}
}
/// One element of a [`Matcher::Seq`].
#[derive(Debug, Clone)]
pub enum MatchPart {
/// A nested matcher; it is compiled into a capture group of its own.
Matcher(Matcher),
/// A regex fragment that is inserted as-is and does not get a capture group.
Regex(RegexPart),
/// Text that is inserted as a regex literal, i.e. matched verbatim.
Literal(Cow<'static, str>),
}
/// Payload of [`MatchPart::Regex`]: a regex fragment in parsed form.
///
/// The field is private, so values are only created inside this module.
#[derive(Debug, Clone)]
pub struct RegexPart {
// Parsed regex fragment; `MatchPart::regex` strips its capture groups before storing it.
hir: Hir,
}
impl MatchPart {
    /// Creates a part from a regex fragment given as text.
    ///
    /// Capture groups in the fragment are removed: each group is replaced by
    /// its contents, so the fragment never contributes capture indices.
    ///
    /// # Errors
    ///
    /// Returns a message describing the parse error if `s` is not a valid regex.
    pub fn regex(s: impl AsRef<str>) -> Result<Self, String> {
        let mut hir = regex_syntax::parse(s.as_ref())
            .map_err(|err| format!("sscanf: Invalid regex segment: {err}"))?;
        strip_captures(&mut hir);
        Ok(Self::Regex(RegexPart { hir }))
    }

    /// Creates a part that matches the given text verbatim.
    pub fn literal(s: impl Into<Cow<'static, str>>) -> Self {
        Self::Literal(s.into())
    }
}
impl From<Matcher> for MatchPart {
fn from(matcher: Matcher) -> Self {
MatchPart::Matcher(matcher)
}
}
/// Renumbers all capture groups inside `hir` and removes their names.
///
/// Groups are numbered consecutively starting at `*capture_index`, an outer
/// group before the groups nested inside it. The counter is advanced by one
/// for every group found.
///
/// # Panics
///
/// Panics if a capture index does not fit into a `u32`.
fn compile_raw(hir: &mut Hir, capture_index: &mut usize) {
    use regex_syntax::hir::HirKind;

    // Subtrees without captures need no changes and are not rebuilt.
    let has_captures = hir.properties().explicit_captures_len() > 0;
    if !has_captures {
        return;
    }

    // Take the node apart, fix up its children, then rebuild it through the
    // `Hir` constructors.
    let kind = std::mem::replace(hir, Hir::empty()).into_kind();
    *hir = match kind {
        HirKind::Capture(mut group) => {
            group.index = u32::try_from(*capture_index).expect("capture index overflowed u32");
            group.name = None;
            *capture_index += 1;
            compile_raw(&mut group.sub, capture_index);
            Hir::capture(group)
        }
        HirKind::Repetition(mut repeated) => {
            compile_raw(&mut repeated.sub, capture_index);
            Hir::repetition(repeated)
        }
        HirKind::Concat(mut parts) => {
            parts
                .iter_mut()
                .for_each(|part| compile_raw(part, capture_index));
            Hir::concat(parts)
        }
        HirKind::Alternation(mut branches) => {
            branches
                .iter_mut()
                .for_each(|branch| compile_raw(branch, capture_index));
            Hir::alternation(branches)
        }
        // Leaf nodes cannot contain a capture, so the early return above
        // should already have handled them.
        HirKind::Empty | HirKind::Literal(_) | HirKind::Class(_) | HirKind::Look(_) => {
            unreachable!(
                r#"sscanf internal error: Encountered capture-free regex containing captures.
Please report this as a bug.
Offender: {kind:?}"#,
            );
        }
    };
}
/// Removes all capture groups from `hir` by replacing each group with its
/// contents.
///
/// Calling this on a regex without capture groups leaves it untouched.
fn strip_captures(hir: &mut Hir) {
    use regex_syntax::hir::HirKind;

    // Subtrees without captures need no changes and are not rebuilt.
    let has_captures = hir.properties().explicit_captures_len() > 0;
    if !has_captures {
        return;
    }

    // Take the node apart, fix up its children, then rebuild it through the
    // `Hir` constructors.
    let kind = std::mem::replace(hir, Hir::empty()).into_kind();
    *hir = match kind {
        HirKind::Capture(group) => {
            // Drop the group itself and continue with whatever it wrapped.
            let mut inner = *group.sub;
            strip_captures(&mut inner);
            inner
        }
        HirKind::Repetition(mut repeated) => {
            strip_captures(&mut repeated.sub);
            Hir::repetition(repeated)
        }
        HirKind::Concat(mut parts) => {
            parts.iter_mut().for_each(strip_captures);
            Hir::concat(parts)
        }
        HirKind::Alternation(mut branches) => {
            branches.iter_mut().for_each(strip_captures);
            Hir::alternation(branches)
        }
        // Leaf nodes cannot contain a capture, so the early return above
        // should already have handled them.
        HirKind::Empty | HirKind::Literal(_) | HirKind::Class(_) | HirKind::Look(_) => {
            unreachable!(
                r#"sscanf internal error: Encountered capture-free regex containing captures.
Please report this as a bug.
Offender: {kind:?}"#,
            );
        }
    };
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Stripping removes every capture group, and stripping again changes nothing.
    #[test]
    fn test_strip_captures() {
        const EXPECTED: &str = "(?:a|(?:[b-d](?:e\\.f)?))";

        let mut hir = regex_syntax::parse("(a)|([b-d](e\\.f)?)").unwrap();

        // The second pass runs on a tree that is already free of captures.
        for _ in 0..2 {
            strip_captures(&mut hir);
            assert_eq!(hir.properties().explicit_captures_len(), 0);
            assert_eq!(hir.to_string(), EXPECTED);
        }
    }

    /// A compiled sequence prints like the equivalent hand-written regex.
    #[test]
    fn test_debug_to_regex() {
        let (prefix, number, separator, flag) =
            ("Value: ", r"([0-9]{1,3})", ", Flag: ", r"true|false");

        let number_matcher = Matcher::from_regex(number).unwrap();
        let flag_matcher = Matcher::from_regex(flag).unwrap();
        let matcher = Matcher::Seq(vec![
            // Borrowed literal.
            MatchPart::literal(prefix),
            MatchPart::Matcher(number_matcher),
            // Owned literal.
            MatchPart::literal(separator.to_owned()),
            MatchPart::Matcher(flag_matcher),
        ]);

        let actual = matcher.debug_to_regex();
        assert_eq!(
            actual,
            "((?:(?:Value: )(([0-9]{1,3}))(?:, Flag: )((?:(?:true)|(?:false)))))"
        );

        // Same structure, written as one regex with explicit groups.
        let handwritten = format!("({prefix}({number}){separator}({flag}))");
        let reference = regex_syntax::parse(&handwritten).unwrap();
        assert_eq!(actual, reference.to_string());
    }
}