sscanf 0.5.0

A sscanf (inverse of format!()) macro with near unlimited parsing capabilities
Documentation
use super::{MatchTreeKind, MatchTreeTemplate};

use std::borrow::Cow;

use regex_syntax::hir::{Capture, Hir};

#[expect(unused_imports, reason = "for doc links")]
use crate::{FromScanf, advanced::Match};

/// A type for matching input strings.
///
/// This is the type returned by [`FromScanf::get_matcher`]. It matches parts of the input string to extract values.
/// A call to the `sscanf` macro then produces a [`Match`] corresponding to the matcher structure.
///
/// For example, the `Fraction` struct from the [`FromScanf`] docs matches inputs following
/// `"{numerator}/{denominator}"`. This becomes a `Matcher` instance like:
///
/// - A sequence ([`Matcher::Seq`]) of three parts:
///   - A [`MatchPart::Matcher`] for the numerator field, obtained by calling [`FromScanf::get_matcher`] for the
///     type of that field
///   - A [`MatchPart::Literal`] for the `/` character
///   - A [`MatchPart::Matcher`] for the denominator field, obtained by calling [`FromScanf::get_matcher`] for the
///     type of that field
///
/// Composing matchers creates a tree that matches input strings and routes extracted values to the right field
/// parsers. In this example, `Fraction`'s [`FromScanf::from_match`] receives a `Match` with child trees for
/// the numerator and denominator, which are then passed to their respective `from_match` methods.
///
/// When a matcher is compiled, every `Matcher` node (including nested ones) is wrapped in a capture group of its own.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub enum Matcher {
    /// A raw regex matcher. Created using [`Matcher::from_regex`].
    ///
    /// Capture groups inside the regex are kept, but they are renumbered and lose their names when the matcher
    /// is compiled.
    ///
    /// Parsed using [`Match::as_regex_matches`].
    Regex(RegexMatcher),
    /// Chain several parts together in sequence.
    ///
    /// Each part is a [`MatchPart`]. Parts that are nested matchers get their own capture group; literal and
    /// regex parts are matched without one.
    ///
    /// Parsed using [`Match::as_seq`].
    Seq(Vec<MatchPart>),
    /// Combine several matchers in a way that only one of them can match at a time.
    ///
    /// Compiled into a regex alternation in which every alternative is wrapped in its own capture group.
    ///
    /// Parsed using [`Match::as_alt`].
    Alt(Vec<Matcher>),
    /// An optional matcher. Created using [`Matcher::optional`].
    ///
    /// Compiled into a greedy "zero or one" repetition of the inner matcher.
    ///
    /// Parsed using [`Match::as_opt`].
    Optional(Box<Matcher>),
}

/// Implementation details of [`Matcher::Regex`].
///
/// Instances are created through [`Matcher::from_regex`]. The field is private so that the `regex_syntax` types
/// do not become part of the public API.
#[derive(Debug, Clone)]
pub struct RegexMatcher {
    /// The parsed form of the regex, including any capture groups it contains.
    hir: Hir,
}

impl Matcher {
    /// Create a new matcher from a regex string.
    ///
    /// Use [`Match::as_regex_matches`] to extract the matches of any capture groups in the regex.
    pub fn from_regex(regex: impl AsRef<str>) -> Result<Self, String> {
        regex_syntax::parse(regex.as_ref())
            .map(|hir| Self::Regex(RegexMatcher { hir }))
            .map_err(|err| err.to_string())
    }

    /// Returns a new matcher that makes this matcher optional.
    ///
    /// Use [`Match::as_opt`] to extract the inner match tree if it was matched.
    pub fn optional(self) -> Self {
        Matcher::Optional(Box::new(self))
    }

    /// Internal constructor for a matcher from a raw HIR. Not public to avoid having a dependency in the public API.
    pub(crate) fn from_raw(hir: Hir) -> Self {
        Self::Regex(RegexMatcher { hir })
    }

    pub(crate) fn compile(self, capture_index: &mut usize) -> (Capture, MatchTreeTemplate) {
        let index = *capture_index;
        *capture_index += 1;
        let (hir, kind) = match self {
            Matcher::Regex(RegexMatcher { mut hir }) => {
                let start_index = *capture_index;
                compile_raw(&mut hir, capture_index);
                let end_index = *capture_index;
                (hir, MatchTreeKind::Regex(start_index..end_index))
            }
            Matcher::Seq(matchers) => {
                let mut hirs = vec![];
                let mut children = vec![];
                for matcher in matchers {
                    match matcher {
                        MatchPart::Matcher(matcher) => {
                            let (capture, child_index) = matcher.compile(capture_index);
                            hirs.push(Hir::capture(capture));
                            children.push(Some(child_index));
                        }
                        MatchPart::Regex(regex_part) => {
                            hirs.push(regex_part.hir);
                            children.push(None);
                        }
                        MatchPart::Literal(Cow::Owned(s)) => {
                            let hir = Hir::literal(s.into_bytes().into_boxed_slice());
                            hirs.push(hir);
                            children.push(None);
                        }
                        MatchPart::Literal(Cow::Borrowed(s)) => {
                            let hir = Hir::literal(s.as_bytes());
                            hirs.push(hir);
                            children.push(None);
                        }
                    }
                }
                (Hir::concat(hirs), MatchTreeKind::Seq(children))
            }
            Matcher::Alt(matchers) => {
                let (hirs, children) = matchers
                    .into_iter()
                    .map(|m| m.compile(capture_index))
                    .map(|(capture, child_index)| (Hir::capture(capture), child_index))
                    .unzip();
                (Hir::alternation(hirs), MatchTreeKind::Alt(children))
            }
            Matcher::Optional(matcher) => {
                let (capture, child_index) = matcher.compile(capture_index);
                let hir = Hir::repetition(regex_syntax::hir::Repetition {
                    min: 0,
                    max: Some(1),
                    greedy: true,
                    sub: Box::new(Hir::capture(capture)),
                });
                (hir, MatchTreeKind::Optional(Box::new(child_index)))
            }
        };
        let capture = Capture {
            index: u32::try_from(index).expect("capture index overflowed u32"),
            name: None,
            sub: Box::new(hir),
        };
        (capture, MatchTreeTemplate { index, kind })
    }

    /// Convert a matcher to a regex string.
    ///
    /// Note that this is an expensive operation and should only be used for debugging or testing purposes.
    ///
    /// Also note that the resulting regex might be different from a regex passed to [`Matcher::from_regex`] due to
    /// optimizations and transformations applied by the regex engine.
    ///
    /// Also also note that the output might change between versions in a semver-defying way, similar to the output of
    /// the `Debug` implementation.
    pub fn debug_to_regex(&self) -> String {
        let mut capture_index = 0;
        let (capture, _) = self.clone().compile(&mut capture_index);
        Hir::capture(capture).to_string()
    }
}

/// One component of e.g. a format string when converting it to a [`Matcher`].
///
/// A list of these makes up a [`Matcher::Seq`].
#[derive(Debug, Clone)]
pub enum MatchPart {
    /// An inner matcher for fields etc. Can also be created from a [`Matcher`] through `From`/`Into`.
    Matcher(Matcher),
    /// A regex string that should be matched. Must not contain any capture groups.
    ///
    /// Created using [`MatchPart::regex`], which removes any capture groups from the given regex.
    Regex(RegexPart),
    /// A literal string that should be matched exactly. See [`MatchPart::literal`] for a convenience constructor.
    Literal(Cow<'static, str>),
}

/// Internal representation of [`MatchPart::Regex`].
///
/// Instances are created through [`MatchPart::regex`].
#[derive(Debug, Clone)]
pub struct RegexPart {
    /// The parsed form of the regex, with all capture groups removed by [`MatchPart::regex`].
    hir: Hir,
}

impl MatchPart {
    /// Parse `s` as a regex and turn it into a [`MatchPart::Regex`].
    ///
    /// Any capture groups will be removed. If you need capture groups, use a [`MatchPart::Matcher`] containing a
    /// [`Matcher::from_regex`].
    ///
    /// # Errors
    ///
    /// Returns a message describing the problem if `s` is not a valid regular expression.
    pub fn regex(s: impl AsRef<str>) -> Result<Self, String> {
        match regex_syntax::parse(s.as_ref()) {
            Ok(mut hir) => {
                strip_captures(&mut hir);
                Ok(Self::Regex(RegexPart { hir }))
            }
            Err(err) => Err(format!("sscanf: Invalid regex segment: {err}")),
        }
    }

    /// Convenience method to create a [`MatchPart::Literal`] from a `String`, `&'static str`, or `Cow<str>`.
    pub fn literal(s: impl Into<Cow<'static, str>>) -> Self {
        let text = s.into();
        Self::Literal(text)
    }
}

impl From<Matcher> for MatchPart {
    fn from(matcher: Matcher) -> Self {
        MatchPart::Matcher(matcher)
    }
}

/// Renumber all capture groups of an existing HIR, starting at `*capture_index`.
///
/// This works around an (undocumented) flaw in the regex_syntax crate: it assumes that an HIR was created from a
/// single call to the parser, with continuous capture indices. When composing HIRs from multiple sources like we
/// do, that assumption doesn't hold, so the capture indices have to be re-assigned to satisfy the requirements:
/// - Capture indices must be unique
/// - Capture indices must be in the range 1..=N where N is the number of capture groups
///
/// Within those constraints any arrangement would do, but the combination requires all indices to be present,
/// so they are simply handed out sequentially. A group receives its index before the groups nested inside it,
/// and `capture_index` is advanced by one per group. Capture names are dropped along the way.
///
/// # Panics
///
/// Panics if a capture index does not fit into a `u32`.
fn compile_raw(hir: &mut Hir, capture_index: &mut usize) {
    use regex_syntax::hir::HirKind;

    if hir.properties().explicit_captures_len() == 0 {
        // No group anywhere below this node, so there is nothing to renumber.
        return;
    }

    // Take the node apart (leaving a placeholder behind) and rebuild it around its updated children.
    let kind = std::mem::replace(hir, Hir::empty()).into_kind();
    *hir = match kind {
        HirKind::Capture(mut group) => {
            let new_index = u32::try_from(*capture_index).expect("capture index overflowed u32");
            *capture_index += 1;
            group.index = new_index;
            group.name = None; // Named captures are not used
            compile_raw(&mut group.sub, capture_index);
            Hir::capture(group)
        }
        HirKind::Repetition(mut repeated) => {
            compile_raw(&mut repeated.sub, capture_index);
            Hir::repetition(repeated)
        }
        HirKind::Concat(mut parts) => {
            parts
                .iter_mut()
                .for_each(|part| compile_raw(part, capture_index));
            Hir::concat(parts)
        }
        HirKind::Alternation(mut branches) => {
            branches
                .iter_mut()
                .for_each(|branch| compile_raw(branch, capture_index));
            Hir::alternation(branches)
        }
        HirKind::Empty | HirKind::Literal(_) | HirKind::Class(_) | HirKind::Look(_) => {
            // Leaf nodes cannot hold captures, yet the check above reported some => contradiction
            unreachable!(
                r#"sscanf internal error: Encountered capture-free regex containing captures.
Please report this as a bug.
Offender: {kind:?}"#,
            );
        }
    };
}

/// Remove all capture groups from an HIR, keeping whatever they contained.
///
/// Every capture node is replaced by its (recursively stripped) sub-expression. An HIR without any capture groups
/// is left untouched.
fn strip_captures(hir: &mut Hir) {
    use regex_syntax::hir::HirKind;

    if hir.properties().explicit_captures_len() == 0 {
        // Nothing to strip anywhere below this node.
        return;
    }

    // Take the node apart (leaving a placeholder behind) and rebuild it from its stripped children.
    let kind = std::mem::replace(hir, Hir::empty()).into_kind();
    *hir = match kind {
        HirKind::Capture(group) => {
            // Drop the group itself and continue with its contents.
            let mut inner = *group.sub;
            strip_captures(&mut inner);
            inner
        }
        HirKind::Repetition(mut repeated) => {
            strip_captures(&mut repeated.sub);
            Hir::repetition(repeated)
        }
        HirKind::Concat(mut parts) => {
            parts.iter_mut().for_each(strip_captures);
            Hir::concat(parts)
        }
        HirKind::Alternation(mut branches) => {
            branches.iter_mut().for_each(strip_captures);
            Hir::alternation(branches)
        }
        HirKind::Empty | HirKind::Literal(_) | HirKind::Class(_) | HirKind::Look(_) => {
            // Leaf nodes cannot hold captures, yet the check above reported some => contradiction
            unreachable!(
                r#"sscanf internal error: Encountered capture-free regex containing captures.
Please report this as a bug.
Offender: {kind:?}"#,
            );
        }
    };
}

#[cfg(test)]
mod tests {
    use super::*;

    /// `strip_captures` has to remove every capture group, nested ones included, and do nothing when run again.
    #[test]
    fn test_strip_captures() {
        let mut hir = regex_syntax::parse("(a)|([b-d](e\\.f)?)").unwrap();

        strip_captures(&mut hir);
        let stripped = hir.to_string();
        assert_eq!(hir.properties().explicit_captures_len(), 0);
        assert_eq!(stripped, "(?:a|(?:[b-d](?:e\\.f)?))");

        // A second pass finds nothing to remove and must leave the HIR as it was.
        strip_captures(&mut hir);
        assert_eq!(hir.properties().explicit_captures_len(), 0);
        assert_eq!(hir.to_string(), stripped);
    }

    /// The regex printed for a composed matcher has to equal the hand-written regex parsed in one go.
    #[test]
    fn test_debug_to_regex() {
        let part_1 = "Value: ";
        let part_2 = r"([0-9]{1,3})";
        let part_3 = ", Flag: ";
        let part_4 = r"true|false";

        let number = Matcher::from_regex(part_2).unwrap();
        let flag = Matcher::from_regex(part_4).unwrap();
        let parts = vec![
            MatchPart::literal(part_1),
            MatchPart::from(number),
            MatchPart::literal(String::from(part_3)),
            MatchPart::from(flag),
        ];

        let regex_str = Matcher::Seq(parts).debug_to_regex();
        assert_eq!(
            regex_str,
            "((?:(?:Value: )(([0-9]{1,3}))(?:, Flag: )((?:(?:true)|(?:false)))))"
        );

        // Any "Matcher" is wrapped in a capture group, hence the additional parentheses
        let combined = format!("({part_1}({part_2}){part_3}({part_4}))");
        let direct = regex_syntax::parse(&combined).unwrap();
        assert_eq!(regex_str, direct.to_string());
    }
}