regex-lite 0.1.9

A lightweight regex engine that optimizes for binary size and compilation time.
Documentation
use {
    anyhow::Result,
    regex_lite::{Regex, RegexBuilder},
    regex_test::{
        CompiledRegex, Match, RegexTest, Span, TestResult, TestRunner,
    },
};

/// Tests the default configuration of the hybrid NFA/DFA.
#[test]
fn default() -> Result<()> {
    let mut runner = TestRunner::new()?;
    runner
        .expand(&["is_match", "find", "captures"], |test| test.compiles())
        .blacklist_iter(super::BLACKLIST)
        .test_iter(crate::suite()?.iter(), compiler)
        .assert();
    Ok(())
}

fn run_test(re: &Regex, test: &RegexTest) -> TestResult {
    let hay = match std::str::from_utf8(test.haystack()) {
        Ok(hay) => hay,
        Err(err) => {
            return TestResult::fail(&format!(
                "haystack is not valid UTF-8: {err}",
            ));
        }
    };
    match test.additional_name() {
        "is_match" => TestResult::matched(re.is_match(hay)),
        "find" => TestResult::matches(
            re.find_iter(hay)
                .take(test.match_limit().unwrap_or(std::usize::MAX))
                .map(|m| Match {
                    id: 0,
                    span: Span { start: m.start(), end: m.end() },
                }),
        ),
        "captures" => {
            let it = re
                .captures_iter(hay)
                .take(test.match_limit().unwrap_or(std::usize::MAX))
                .map(|caps| testify_captures(&caps));
            TestResult::captures(it)
        }
        name => TestResult::fail(&format!("unrecognized test name: {name}")),
    }
}

/// Converts the given regex test to a closure that searches with a
/// `bytes::Regex`. If the test configuration is unsupported, then a
/// `CompiledRegex` that skips the test is returned.
fn compiler(
    test: &RegexTest,
    _patterns: &[String],
) -> anyhow::Result<CompiledRegex> {
    let Some(pattern) = skip_or_get_pattern(test) else {
        return Ok(CompiledRegex::skip());
    };
    let re = RegexBuilder::new(pattern)
        .case_insensitive(test.case_insensitive())
        .build()?;
    Ok(CompiledRegex::compiled(move |test| run_test(&re, test)))
}

/// Whether we should skip the given test or not. If not, return the single
/// pattern from the given test.
fn skip_or_get_pattern(test: &RegexTest) -> Option<&str> {
    // We're only testing Regex here, which supports one pattern only.
    let pattern = match test.regexes().len() {
        1 => &test.regexes()[0],
        _ => return None,
    };
    // If the test name contains 'regex-lite', then we ALWAYS run it. Because
    // those tests are specifically designed for regex-lite. So if they fail,
    // then something needs attention.
    if test.full_name().contains("regex-lite/") {
        return Some(pattern);
    }
    // If the pattern has a \p in it, then we almost certainly don't support
    // it. This probably skips more than we intend, but there are likely very
    // few tests that contain a \p that isn't also a Unicode class.
    if pattern.contains(r"\p") || pattern.contains(r"\P") {
        return None;
    }
    // Similar deal for Perl classes, but we can abide them if the haystack
    // is ASCII-only.
    if !test.haystack().is_ascii() {
        if pattern.contains(r"\d") || pattern.contains(r"\D") {
            return None;
        }
        if pattern.contains(r"\s") || pattern.contains(r"\S") {
            return None;
        }
        if pattern.contains(r"\w") || pattern.contains(r"\W") {
            return None;
        }
    }
    // And also same deal for word boundaries.
    if !test.haystack().is_ascii() {
        if pattern.contains(r"\b") || pattern.contains(r"\B") {
            return None;
        }
    }
    // We only test is_match, find_iter and captures_iter. All of those are
    // leftmost searches.
    if !matches!(test.search_kind(), regex_test::SearchKind::Leftmost) {
        return None;
    }
    // The top-level single-pattern regex API always uses leftmost-first.
    if !matches!(test.match_kind(), regex_test::MatchKind::LeftmostFirst) {
        return None;
    }
    // The top-level regex API always runs unanchored searches. ... But we can
    // handle tests that are anchored but have only one match.
    if test.anchored() && test.match_limit() != Some(1) {
        return None;
    }
    // We don't support tests with explicit search bounds. We could probably
    // support this by using the 'find_at' (and such) APIs.
    let bounds = test.bounds();
    if !(bounds.start == 0 && bounds.end == test.haystack().len()) {
        return None;
    }
    // The Regex API specifically does not support disabling UTF-8 mode because
    // it can only search &str which is always valid UTF-8.
    if !test.utf8() {
        return None;
    }
    // regex-lite doesn't support Unicode-aware case insensitive matching.
    if test.case_insensitive()
        && (!pattern.is_ascii() || !test.haystack().is_ascii())
    {
        return None;
    }
    Some(pattern)
}

/// Convert `Captures` into the test suite's capture values.
fn testify_captures(caps: &regex_lite::Captures<'_>) -> regex_test::Captures {
    let spans = caps.iter().map(|group| {
        group.map(|m| regex_test::Span { start: m.start(), end: m.end() })
    });
    // This unwrap is OK because we assume our 'caps' represents a match, and
    // a match always gives a non-zero number of groups with the first group
    // being non-None.
    regex_test::Captures::new(0, spans).unwrap()
}