use super::*;
/// A one-character pattern lands in the `Byte` tier with reason
/// "shape:contains-byte" and matches haystacks containing that byte.
#[test]
fn byte_tier_contains_byte() {
    let matcher = StrMatcher::new("x").unwrap();

    // Classification first, then behaviour.
    assert_eq!(matcher.tier(), MatcherTier::Byte);
    assert_eq!(matcher.reason(), "shape:contains-byte");

    let with_x: &[u8] = b"axc";
    let without_x: &[u8] = b"abc";
    assert!(matcher.is_match(with_x));
    assert!(!matcher.is_match(without_x));
}
/// A two-member character class is classified as a `Byte`-tier byte set.
#[test]
fn byte_tier_byte_set_2() {
    let set_matcher = StrMatcher::new("[ab]").unwrap();
    assert_eq!(set_matcher.tier(), MatcherTier::Byte);
    assert_eq!(set_matcher.reason(), "shape:byte-set");

    // Either member of the class is enough for a match.
    let members: [&[u8]; 2] = [b"...a...", b"...b..."];
    for hay in members {
        assert!(set_matcher.is_match(hay));
    }
    assert!(!set_matcher.is_match(b"...c..."));
}
/// A three-member class (comma, tab, pipe) is still `Byte` tier.
#[test]
fn byte_tier_byte_set_3() {
    let delimiters = StrMatcher::new("[,\t|]").unwrap();
    assert_eq!(delimiters.tier(), MatcherTier::Byte);

    let delimited: [&[u8]; 3] = [b"foo,bar", b"foo\tbar", b"foo|bar"];
    for hay in delimited {
        assert!(delimiters.is_match(hay));
    }
    // A semicolon is not part of the class.
    assert!(!delimiters.is_match(b"foo;bar"));
}
/// A start-anchored single byte is `Byte` tier and only matches at offset 0.
#[test]
fn byte_tier_single_byte_anchored_start() {
    let leading_slash = StrMatcher::new(r"^/").unwrap();
    assert_eq!(leading_slash.tier(), MatcherTier::Byte);

    let rooted: &[u8] = b"/api";
    let relative: &[u8] = b"api";
    assert!(leading_slash.is_match(rooted));
    assert!(!leading_slash.is_match(relative));
}
/// An unanchored multi-byte literal is `Literal` tier ("shape:contains-literal").
#[test]
fn literal_tier_multi_byte_unanchored() {
    let needle = StrMatcher::new("foo").unwrap();
    assert_eq!(needle.tier(), MatcherTier::Literal);
    assert_eq!(needle.reason(), "shape:contains-literal");

    let containing: &[u8] = b"xfoox";
    let lacking: &[u8] = b"xbarx";
    assert!(needle.is_match(containing));
    assert!(!needle.is_match(lacking));
}
/// A `^`-anchored literal is `Literal` tier ("shape:starts-with") and must
/// not match when the literal appears after a prefix.
#[test]
fn literal_tier_starts_with_multi_byte() {
    let prefix = StrMatcher::new(r"^/api/v1/").unwrap();
    assert_eq!(prefix.tier(), MatcherTier::Literal);
    assert_eq!(prefix.reason(), "shape:starts-with");

    let at_start: &[u8] = b"/api/v1/orders";
    let mid_string: &[u8] = b"prefix/api/v1/orders";
    assert!(prefix.is_match(at_start));
    assert!(!prefix.is_match(mid_string));
}
/// A `$`-anchored literal is `Literal` tier ("shape:ends-with") and must not
/// match when more bytes follow the literal.
#[test]
fn literal_tier_ends_with_multi_byte() {
    let suffix = StrMatcher::new(r"\.log$").unwrap();
    assert_eq!(suffix.tier(), MatcherTier::Literal);
    assert_eq!(suffix.reason(), "shape:ends-with");

    let at_end: &[u8] = b"server.log";
    let followed: &[u8] = b"server.log.gz";
    assert!(suffix.is_match(at_end));
    assert!(!suffix.is_match(followed));
}
/// `is_match` on an ends-with pattern must cope with haystacks shorter than
/// the literal, and match once the haystack is at least as long.
#[test]
fn ends_with_does_not_panic_on_short_haystack() {
    let suffix = StrMatcher::new(r"\.log$").unwrap();

    // Everything shorter than ".log" (4 bytes) must simply report no match.
    let too_short: [&[u8]; 4] = [b"", b"a", b"ab", b"abc"];
    for hay in too_short {
        assert!(!suffix.is_match(hay));
    }

    let long_enough: [&[u8]; 2] = [b".log", b"x.log"];
    for hay in long_enough {
        assert!(suffix.is_match(hay));
    }
}
/// `find` on an ends-with pattern returns `None` for too-short haystacks and
/// reports start 0 when the haystack is exactly the literal.
#[test]
fn ends_with_find_does_not_panic_on_short_haystack() {
    let suffix = StrMatcher::new(r"\.log$").unwrap();

    let too_short: [&[u8]; 2] = [b"", b"ab"];
    for hay in too_short {
        assert!(suffix.find(hay).is_none());
    }

    let start = suffix.find(b".log").map(|hit| hit.start);
    assert_eq!(start, Some(0));
}
/// A literal anchored at both ends is `Literal` tier ("shape:exact-match")
/// and matches only the haystack equal to the literal.
#[test]
fn literal_tier_exact_match_multi_byte() {
    let exact = StrMatcher::new(r"^foo$").unwrap();
    assert_eq!(exact.tier(), MatcherTier::Literal);
    assert_eq!(exact.reason(), "shape:exact-match");

    assert!(exact.is_match(b"foo"));
    // Extra bytes on either side break the exact match.
    let padded: [&[u8]; 2] = [b"foobar", b"barfoo"];
    for hay in padded {
        assert!(!exact.is_match(hay));
    }
}
/// A plain alternation of literals is classified as `LiteralSet`.
#[test]
fn literal_set_tier_alternation() {
    let tokens = StrMatcher::new("AKIA|ghp_|sk_live_").unwrap();
    assert_eq!(tokens.tier(), MatcherTier::LiteralSet);

    // One haystack per alternative.
    let positives: [&[u8]; 3] = [
        b"... AKIA1234 ...",
        b"github token: ghp_abcdef",
        b"sk_live_yes",
    ];
    for hay in positives {
        assert!(tokens.is_match(hay));
    }
    assert!(!tokens.is_match(b"nothing here"));
}
/// A start-anchored alternation is `LiteralSet` and only matches at offset 0.
#[test]
fn literal_set_tier_alternation_anchored() {
    let anchored = StrMatcher::new(r"^(?:foo|bar|baz)").unwrap();
    assert_eq!(anchored.tier(), MatcherTier::LiteralSet);

    let leading: [&[u8]; 2] = [b"foo123", b"baz9"];
    for hay in leading {
        assert!(anchored.is_match(hay));
    }
    assert!(!anchored.is_match(b"123foo"));
}
/// An end-anchored alternation is `LiteralSet`; an earlier alternative inside
/// the haystack must not hide a later one that sits at the end.
#[test]
fn literal_set_at_end_finds_later_match() {
    let trailing = StrMatcher::new(r"(?:foo|bar|baz)$").unwrap();
    assert_eq!(trailing.tier(), MatcherTier::LiteralSet);

    // "foo" occurs first, but it is "bar" that sits at the end.
    let ends_with_bar: &[u8] = b"foo_bar";
    assert!(trailing.is_match(ends_with_bar));

    // All three alternatives occur, none of them at the end.
    let ends_with_x: &[u8] = b"foo_bar_baz_x";
    assert!(!trailing.is_match(ends_with_x));

    let ends_with_baz: &[u8] = b"x_baz";
    assert!(trailing.is_match(ends_with_baz));
}
/// A fully anchored alternation is `LiteralSet` and matches only haystacks
/// equal to one of the alternatives.
#[test]
fn literal_set_exact_finds_full_haystack_match() {
    let exact = StrMatcher::new(r"^(?:foo|bar|baz)$").unwrap();
    assert_eq!(exact.tier(), MatcherTier::LiteralSet);

    let whole: [&[u8]; 3] = [b"foo", b"bar", b"baz"];
    for hay in whole {
        assert!(exact.is_match(hay));
    }

    // Concatenations and padded variants are not exact matches.
    let partial: [&[u8]; 3] = [b"foobar", b"x_foo", b"foo_x"];
    for hay in partial {
        assert!(!exact.is_match(hay));
    }
}
/// `\b` sends the pattern to the `Regex` tier with reason "word-boundary".
#[test]
fn meta_word_boundary() {
    let bounded = StrMatcher::new(r"\bfoo\b").unwrap();
    assert_eq!(bounded.tier(), MatcherTier::Regex);
    assert_eq!(bounded.reason(), "word-boundary");

    let standalone: &[u8] = b"this foo here";
    let embedded: &[u8] = b"unfooed";
    assert!(bounded.is_match(standalone));
    assert!(!bounded.is_match(embedded));
}
/// `+` sends the pattern to the `Regex` tier with reason "unbounded-quantifier".
#[test]
fn meta_unbounded_quantifier() {
    let email_like = StrMatcher::new(r"\w+@\w+").unwrap();
    assert_eq!(email_like.tier(), MatcherTier::Regex);
    assert_eq!(email_like.reason(), "unbounded-quantifier");

    let address: &[u8] = b"user@host";
    assert!(email_like.is_match(address));
}
/// A look-ahead pattern fails to build with `BuildError::Syntax`.
#[test]
fn meta_lookahead_rejected_at_parse() {
    let failure = StrMatcher::new(r"foo(?=bar)").unwrap_err();
    // Any other error variant is a test failure and is printed for diagnosis.
    if !matches!(failure, BuildError::Syntax { .. }) {
        panic!("expected Syntax error, got {failure:?}");
    }
}
/// The empty pattern is rejected with `BuildError::Empty`, and the rendered
/// error text carries a "hint:" section.
#[test]
fn empty_pattern_is_error() {
    let failure = StrMatcher::new("").unwrap_err();
    assert!(matches!(failure, BuildError::Empty));

    let rendered = failure.to_string();
    let has_hint = rendered.contains("hint:");
    assert!(has_hint, "missing hint in: {rendered}");
}
/// Differential tests: `StrMatcher::is_match` must agree with
/// `regex::bytes::Regex::is_match` on the same pattern and haystack.
mod equivalence {
    use super::*;

    /// Builds both matchers for `pattern` and asserts they give the same
    /// `is_match` answer for every haystack in `cases`.
    fn assert_equivalent(pattern: &str, cases: &[&[u8]]) {
        let matcher = StrMatcher::new(pattern).unwrap();
        let reference = regex::bytes::Regex::new(pattern).unwrap();
        for hay in cases.iter() {
            let ours = matcher.is_match(hay);
            let theirs = reference.is_match(hay);
            assert_eq!(
                ours,
                theirs,
                "divergence on pattern {pattern:?} haystack {hay:?} (strmatch tier: {})",
                matcher.tier()
            );
        }
    }

    /// Single-byte, byte-set, and anchored/unanchored literal patterns.
    #[test]
    fn shape_patterns_match_regex() {
        let patterns = [
            "x",
            "[ab]",
            "[,\t|]",
            "foo",
            r"^/api/v1/",
            r"\.log$",
            r"^foo$",
        ];
        // Includes the empty haystack and a non-UTF-8 byte sequence.
        let haystacks: &[&[u8]] = &[
            b"",
            b"a",
            b"foo",
            b"barfoo",
            b"foobar",
            b"this is foo here",
            b"FOO",
            b"\xff\xfe\xfd",
            b"/api/v1/foo",
        ];
        patterns
            .into_iter()
            .for_each(|pattern| assert_equivalent(pattern, haystacks));
    }

    /// Literal alternations, unanchored and start-anchored.
    #[test]
    fn literal_alternation_matches_regex() {
        let patterns = ["foo|bar|baz", "AKIA|ghp_|sk_live_", "^(?:foo|bar|baz)"];
        let haystacks: &[&[u8]] = &[
            b"",
            b"foo",
            b"bar",
            b"baz",
            b"barbaz",
            b"foobar",
            b"qux",
            b"nothing here",
            b"AKIA1234",
            b"ghp_abc",
        ];
        patterns
            .into_iter()
            .for_each(|pattern| assert_equivalent(pattern, haystacks));
    }

    /// Patterns that other tests in this file pin to the `Regex` tier.
    #[test]
    fn meta_patterns_match_regex() {
        let patterns = [r"\w+@\w+", r"\bfoo\b"];
        let haystacks: &[&[u8]] = &[
            b"",
            b"foo",
            b"user@host.example",
            b"a@b",
            b"nothing here",
            b"the foo wins",
            b"unfooed and unhappy",
        ];
        patterns.into_iter().for_each(|pattern| {
            // The warn state is reset before each pattern is built.
            crate::strmatch::reset_warn_state_for_tests();
            assert_equivalent(pattern, haystacks);
        });
    }
}
/// Tests around the builder's `min_tier` / `on_below_min` policy and the
/// behaviour of repeated builds after the warn state has been reset.
mod antispam {
    use super::*;

    /// Building the same `Regex`-tier pattern twice after a warn-state reset
    /// must succeed both times and classify identically.
    ///
    /// NOTE(review): the test name suggests the second build is where warning
    /// de-duplication applies. No warning sink is visible from this module, so
    /// only the resulting matchers are checked here; add an assertion on the
    /// emitted warnings once a capture hook exists.
    #[test]
    fn fallback_dedup_first_time_only() {
        crate::strmatch::reset_warn_state_for_tests();
        let first = StrMatcher::new(r"\bfoo\b").unwrap();
        let second = StrMatcher::new(r"\bfoo\b").unwrap();
        for built in [&first, &second] {
            assert_eq!(built.tier(), MatcherTier::Regex);
            assert_eq!(built.reason(), "word-boundary");
        }
    }

    /// With `OnBelowMin::Reject`, a pattern below `min_tier` fails to build
    /// with a `TierTooLow` error carrying the wanted tier, the actual tier,
    /// the classification reason, and a non-empty hint.
    #[test]
    fn tier_too_low_reject_returns_structured_error() {
        crate::strmatch::reset_warn_state_for_tests();
        let err = StrMatcher::builder()
            .min_tier(MatcherTier::LiteralSet)
            .on_below_min(OnBelowMin::Reject)
            .build(r"\bfoo\b")
            .unwrap_err();
        match err {
            BuildError::TierTooLow {
                wanted,
                got,
                reason,
                hint,
                ..
            } => {
                assert_eq!(wanted, MatcherTier::LiteralSet);
                assert_eq!(got, MatcherTier::Regex);
                assert_eq!(reason, "word-boundary");
                assert!(!hint.is_empty());
            }
            other => panic!("expected TierTooLow, got {other:?}"),
        }
    }

    /// With `OnBelowMin::Warn`, a below-minimum pattern still builds and the
    /// resulting matcher is usable.
    #[test]
    fn tier_too_low_warn_still_builds() {
        crate::strmatch::reset_warn_state_for_tests();
        let m = StrMatcher::builder()
            .min_tier(MatcherTier::LiteralSet)
            .on_below_min(OnBelowMin::Warn)
            .build(r"\bfoo\b")
            .unwrap();
        assert_eq!(m.tier(), MatcherTier::Regex);
        assert!(m.is_match(b"a foo b"));
    }

    /// With `OnBelowMin::Allow`, a below-minimum pattern builds successfully.
    #[test]
    fn tier_too_low_allow_is_silent_by_default() {
        crate::strmatch::reset_warn_state_for_tests();
        let m = StrMatcher::builder()
            .min_tier(MatcherTier::LiteralSet)
            .on_below_min(OnBelowMin::Allow)
            .build(r"\bfoo\b")
            .unwrap();
        assert_eq!(m.tier(), MatcherTier::Regex);
    }
}
/// `tier_counts` tallies each input pattern under its own tier: here two
/// start-anchored literals, one alternation, and one word-boundary pattern.
#[test]
fn set_tier_counts_match_per_pattern_classification() {
    crate::strmatch::reset_warn_state_for_tests();

    let patterns = [r"^/api/", r"^/v2/", r"foo|bar", r"\bfoo\b"];
    let matcher_set = StrMatcherSet::new(patterns).unwrap();

    assert_eq!(matcher_set.tier_counts(), [0, 2, 1, 1]);
    assert_eq!(matcher_set.len(), 4);
    assert!(!matcher_set.is_empty());
}
/// `earliest_match` reports the hit with the lowest start offset, regardless
/// of the order in which the patterns were supplied.
#[test]
fn set_earliest_match_returns_lowest_start() {
    let matcher_set = StrMatcherSet::new(["bar", "foo"]).unwrap();
    let earliest = matcher_set.earliest_match(b"baz foo bar quux").unwrap();

    // "foo" (input index 1) begins at offset 4, ahead of "bar".
    assert_eq!(
        (earliest.start, earliest.end, earliest.pattern_idx),
        (4, 7, 1)
    );
}
/// A set matches when any one of its patterns matches, even if an earlier
/// pattern in the list does not.
#[test]
fn set_is_match_short_circuits_on_first_hit() {
    let matcher_set = StrMatcherSet::new(["impossible", "foo"]).unwrap();
    let hay: &[u8] = b"contains foo";
    assert!(matcher_set.is_match(hay));
}
/// With ASCII case-insensitivity enabled, a single literal is classified as
/// `LiteralSet` and matches every casing of the literal.
#[test]
fn ascii_case_insensitive_single_literal() {
    let builder = StrMatcher::builder().ascii_case_insensitive(true);
    let folded = builder.build("foo").unwrap();
    assert_eq!(folded.tier(), MatcherTier::LiteralSet);

    let casings: [&[u8]; 3] = [b"FOO", b"Foo", b"foo"];
    for hay in casings {
        assert!(folded.is_match(hay));
    }
    assert!(!folded.is_match(b"bar"));
}
/// With ASCII case-insensitivity enabled, an upper-case alternation matches
/// lower-case and upper-case haystacks alike.
#[test]
fn ascii_case_insensitive_alternation() {
    let builder = StrMatcher::builder().ascii_case_insensitive(true);
    let folded = builder.build("AKIA|GHP_").unwrap();
    assert_eq!(folded.tier(), MatcherTier::LiteralSet);

    let positives: [&[u8]; 3] = [b"akia1234", b"ghp_abc", b"AKIA1234"];
    for hay in positives {
        assert!(folded.is_match(hay));
    }
}
/// `find` on an unanchored literal reports the span of the literal.
#[test]
fn find_offsets_for_shape_tier() {
    let needle = StrMatcher::new("foo").unwrap();
    let span = needle.find(b"...foo...").unwrap();
    assert_eq!((span.start, span.end), (3, 6));
}
/// `find` on an alternation reports the span of the alternative that hit.
#[test]
fn find_offsets_for_literal_set() {
    let either = StrMatcher::new("foo|barbaz").unwrap();
    let span = either.find(b"_barbaz_").unwrap();
    assert_eq!((span.start, span.end), (1, 7));
}
/// `find` on a start-anchored literal reports a span at offset 0, and `None`
/// when the literal is not at the start.
#[test]
fn find_offsets_for_anchored_shape() {
    let prefix = StrMatcher::new(r"^abc").unwrap();

    let span = prefix.find(b"abcxyz").unwrap();
    assert_eq!((span.start, span.end), (0, 3));

    let shifted: &[u8] = b"_abcxyz";
    assert!(prefix.find(shifted).is_none());
}
/// Ranks decrease strictly from `Byte` down to `Regex`, with the concrete
/// values 4, 3, 2, 1.
#[test]
fn tier_rank_ordering_is_byte_literal_literalset_regex() {
    let byte = MatcherTier::Byte.rank();
    let literal = MatcherTier::Literal.rank();
    let literal_set = MatcherTier::LiteralSet.rank();
    let regex = MatcherTier::Regex.rank();

    // Relative ordering.
    assert!(byte > literal);
    assert!(literal > literal_set);
    assert!(literal_set > regex);

    // Absolute values.
    assert_eq!(byte, 4);
    assert_eq!(literal, 3);
    assert_eq!(literal_set, 2);
    assert_eq!(regex, 1);
}
/// Each tier reports its expected `typical_budget_ns`; `Regex` has none.
#[test]
fn typical_budget_ns_matches_documented_values() {
    let expectations = [
        (MatcherTier::Byte, Some(30)),
        (MatcherTier::Literal, Some(200)),
        (MatcherTier::LiteralSet, Some(500)),
        (MatcherTier::Regex, None),
    ];
    for (tier, budget) in expectations {
        assert_eq!(tier.typical_budget_ns(), budget);
    }
}
/// `to_string()` on each tier yields the variant name in PascalCase.
#[test]
fn matcher_tier_display_renders_pascal_case() {
    let expectations = [
        (MatcherTier::Byte, "Byte"),
        (MatcherTier::Literal, "Literal"),
        (MatcherTier::LiteralSet, "LiteralSet"),
        (MatcherTier::Regex, "Regex"),
    ];
    for (tier, text) in expectations {
        assert_eq!(tier.to_string(), text);
    }
}
/// `pattern()` hands back the text the matcher was built from.
#[test]
fn pattern_accessor_round_trips() {
    let source = r"^/api/v1/";
    let matcher = StrMatcher::new(source).unwrap();
    assert_eq!(matcher.pattern(), "^/api/v1/");
}
/// `pattern()` returns the input unchanged for a mix of anchored, meta,
/// alternation, and single-byte patterns.
#[test]
fn pattern_accessor_preserves_input_verbatim() {
    let sources = [r"^foo$", r"\bfoo\b", "AKIA|ghp_", "x"];
    for source in sources {
        let matcher = StrMatcher::new(source).unwrap();
        assert_eq!(
            matcher.pattern(),
            source,
            "pattern() should be the verbatim input"
        );
    }
}
/// None of the sampled patterns matches the empty haystack through
/// `is_match`, `find`, or `find_iter`.
#[test]
fn empty_haystack_against_every_tier() {
    let empty: &[u8] = b"";
    for pattern in ["x", "foo", r"^foo$", "foo|bar", r"\w+@\w+"] {
        let matcher = StrMatcher::new(pattern).unwrap();

        let matched = matcher.is_match(empty);
        assert!(
            !matched,
            "pattern {pattern:?} matched empty haystack (tier {})",
            matcher.tier(),
        );
        assert!(matcher.find(empty).is_none());
        assert_eq!(matcher.find_iter(empty).count(), 0);
    }
}
/// A one-byte haystack matches a one-byte pattern only when the bytes are
/// equal, and `find` reports the span 0..1.
#[test]
fn single_byte_haystack_byte_tier() {
    let matcher = StrMatcher::new("x").unwrap();

    let same: &[u8] = b"x";
    let different: &[u8] = b"y";
    assert!(matcher.is_match(same));
    assert!(!matcher.is_match(different));

    let span = matcher.find(same).unwrap();
    assert_eq!(span.start, 0);
    assert_eq!(span.end, 1);
}
/// `^foo$` matches the haystack "foo" and nothing longer, shifted, or empty.
#[test]
fn haystack_equals_pattern_in_exact_match() {
    let exact = StrMatcher::new(r"^foo$").unwrap();
    assert!(exact.is_match(b"foo"));

    let rejected: [&[u8]; 3] = [b"foox", b"xfoo", b""];
    for hay in rejected {
        assert!(!exact.is_match(hay));
    }
}
/// Matching works on the UTF-8 bytes of the haystack: an ASCII needle is
/// found at its byte offset, and a non-ASCII needle matches only when the
/// haystack contains that character.
#[test]
fn multi_byte_utf8_haystack_does_not_split_literals() {
    let ascii = StrMatcher::new("f").unwrap();
    let span = ascii.find("café".as_bytes()).unwrap();
    assert_eq!(span.start, 2);

    // Either of the two tiers is accepted for the "\u{e9}" pattern.
    let accented = StrMatcher::new("\u{e9}").unwrap();
    let accepted_tiers = [MatcherTier::Byte, MatcherTier::Literal];
    assert!(accepted_tiers.contains(&accented.tier()));

    assert!(accented.is_match("café".as_bytes()));
    assert!(!accented.is_match("cafe".as_bytes()));
}
/// `find_iter` reports every occurrence in a long haystack: 64 records of
/// "AKIA" followed by 60 filler dots (4096 bytes in total) yield 64 hits.
#[test]
fn long_haystack_with_many_matches() {
    let mut hay = Vec::with_capacity(4096);
    // The iteration index is not needed; only the repeat count matters.
    for _ in 0..64 {
        hay.extend_from_slice(b"AKIA");
        hay.extend_from_slice(&[b'.'; 60]);
    }
    let m = StrMatcher::new("AKIA").unwrap();
    assert_eq!(m.find_iter(&hay).count(), 64);
}
/// The `(?m)` flag sends the pattern to the `Regex` tier with reason
/// "multiline-anchor".
#[test]
fn meta_reason_multiline_anchor() {
    let multiline = StrMatcher::new(r"(?m)^foo").unwrap();
    let (tier, reason) = (multiline.tier(), multiline.reason());
    assert_eq!(tier, MatcherTier::Regex);
    assert_eq!(reason, "multiline-anchor");
}
/// A bare `\w` is `Regex` tier; either of two reason strings is accepted.
#[test]
fn meta_reason_unicode_class() {
    let word_class = StrMatcher::new(r"\w").unwrap();
    assert_eq!(word_class.tier(), MatcherTier::Regex);

    let reason = word_class.reason();
    let recognised =
        reason == "unicode-class" || reason == "character class with too many codepoints";
    assert!(recognised, "unexpected reason: {reason}");
}
/// A rejected multiline pattern yields `TierTooLow` with reason
/// "multiline-anchor" and a hint that is long enough and mentions one of the
/// expected keywords.
#[test]
fn meta_reason_for_tier_too_low_error_includes_actionable_hint() {
    let strict = StrMatcher::builder()
        .min_tier(MatcherTier::LiteralSet)
        .on_below_min(OnBelowMin::Reject);
    let failure = strict.build(r"(?m)^foo").unwrap_err();

    match failure {
        BuildError::TierTooLow {
            hint: advice,
            reason: why,
            ..
        } => {
            assert_eq!(why, "multiline-anchor");
            assert!(advice.len() > 20, "hint should be actionable: {advice:?}");

            let suggests_fix = advice.contains("haystack")
                || advice.contains("line")
                || advice.contains("accept");
            assert!(suggests_fix, "hint should suggest a fix: {advice:?}");
        }
        unexpected => panic!("expected TierTooLow, got {unexpected:?}"),
    }
}
/// With `min_tier(Byte)` and `Reject`, a `Literal`-tier pattern fails with
/// `TierTooLow` naming both the wanted and the actual tier.
#[test]
fn min_tier_byte_rejects_literal_tier_pattern() {
    let strict = StrMatcher::builder()
        .min_tier(MatcherTier::Byte)
        .on_below_min(OnBelowMin::Reject);
    let failure = strict.build("foo").unwrap_err();

    match failure {
        BuildError::TierTooLow {
            wanted: required,
            got: actual,
            ..
        } => {
            assert_eq!(required, MatcherTier::Byte);
            assert_eq!(actual, MatcherTier::Literal);
        }
        unexpected => panic!("expected TierTooLow, got {unexpected:?}"),
    }
}
/// With `min_tier(Byte)` and `Reject`, a `Byte`-tier pattern still builds.
#[test]
fn min_tier_byte_accepts_byte_tier_pattern() {
    let strict = StrMatcher::builder()
        .min_tier(MatcherTier::Byte)
        .on_below_min(OnBelowMin::Reject);
    let built = strict.build("x").unwrap();
    assert_eq!(built.tier(), MatcherTier::Byte);
}
/// With `min_tier(LiteralSet)` and `Reject`, single-byte, literal, and
/// alternation patterns build; only the word-boundary pattern is refused.
#[test]
fn min_tier_literal_set_rejects_only_regex_tier() {
    let builder = StrMatcher::builder()
        .min_tier(MatcherTier::LiteralSet)
        .on_below_min(OnBelowMin::Reject);

    for accepted in ["x", "foo", "foo|bar"] {
        assert!(builder.build(accepted).is_ok());
    }
    assert!(builder.build(r"\bfoo\b").is_err());
}
/// A set built from no patterns is empty, never matches, and reports zero
/// for every tier count.
#[test]
fn empty_set_construction() {
    let no_patterns = std::iter::empty::<&str>();
    let empty_set = StrMatcherSet::new(no_patterns).unwrap();

    // Size queries.
    assert!(empty_set.is_empty());
    assert_eq!(empty_set.len(), 0);

    // Every search entry point comes back empty-handed.
    let hay: &[u8] = b"anything";
    assert!(!empty_set.is_match(hay));
    assert!(empty_set.earliest_match(hay).is_none());
    assert_eq!(empty_set.find_iter(hay).count(), 0);

    assert_eq!(empty_set.tier_counts(), [0, 0, 0, 0]);
}
/// A one-pattern set and a standalone matcher for the same pattern agree on
/// `is_match` for every sampled haystack.
#[test]
fn single_pattern_set_behaves_like_single_matcher() {
    let standalone = StrMatcher::new("AKIA").unwrap();
    let singleton = StrMatcherSet::new(["AKIA"]).unwrap();

    let haystacks: &[&[u8]] = &[b"", b"AKIA", b"_AKIA_", b"akia", b"a long AKIA12345 line"];
    for &hay in haystacks {
        let from_matcher = standalone.is_match(hay);
        let from_set = singleton.is_match(hay);
        assert_eq!(from_matcher, from_set, "divergence on {hay:?}");
    }
}
/// `find_iter` on a single-byte pattern yields one hit per occurrence, in
/// haystack order.
#[test]
fn find_iter_byte_tier_returns_every_position() {
    let matcher = StrMatcher::new("x").unwrap();
    let starts: Vec<_> = matcher
        .find_iter(b"axbxcxd")
        .map(|hit| hit.start)
        .collect();
    // Three hits, at the three positions of `x`.
    assert_eq!(starts, [1, 3, 5]);
}
/// `find_iter` yields non-overlapping hits: "aa" in "aaaa" is found twice,
/// not three times.
#[test]
fn find_iter_literal_tier_non_overlapping() {
    let pair = StrMatcher::new("aa").unwrap();
    let found: Vec<Match> = pair.find_iter(b"aaaa").collect();
    assert_eq!(
        found,
        [Match { start: 0, end: 2 }, Match { start: 2, end: 4 }]
    );
}
/// `find_iter` on a start-anchored pattern yields only the hit at offset 0,
/// even when the literal repeats later.
#[test]
fn find_iter_anchored_yields_at_most_one() {
    let prefix = StrMatcher::new(r"^foo").unwrap();
    let found: Vec<Match> = prefix.find_iter(b"foofoo").collect();
    assert_eq!(found, [Match { start: 0, end: 3 }]);
}
/// `find_iter` on an alternation yields hits in haystack order, not in the
/// order the alternatives were written.
#[test]
fn find_iter_literal_set_yields_in_order() {
    let either = StrMatcher::new("foo|bar").unwrap();
    let starts: Vec<_> = either
        .find_iter(b"X bar Y foo Z")
        .map(|hit| hit.start)
        .collect();
    assert_eq!(starts, [2, 8]);
}
/// `StrMatcherSet::find_iter` yields hits from all patterns ordered by start
/// offset, each tagged with the index of the pattern that produced it.
#[test]
fn set_find_iter_merges_across_patterns_sorted_by_position() {
    let matcher_set = StrMatcherSet::new(["foo", "bar", "baz"]).unwrap();

    // Project each hit to (start offset, input pattern index).
    let observed: Vec<(usize, usize)> = matcher_set
        .find_iter(b"bar X foo Y baz")
        .map(|hit| (hit.start, hit.pattern_idx))
        .collect();

    assert_eq!(observed, [(0, 1), (6, 0), (12, 2)]);
}
/// With 50 literal patterns in one set, each hit still reports the index of
/// the input pattern that matched.
#[test]
fn set_merged_ac_pattern_indices_survive_across_merge_boundary() {
    // tok000 ..= tok049, so the pattern text encodes its own index.
    let patterns: Vec<String> = (0..50).map(|n| format!("tok{n:03}")).collect();
    let matcher_set = StrMatcherSet::new(patterns.iter()).unwrap();
    assert_eq!(matcher_set.len(), 50);

    let indices: Vec<usize> = matcher_set
        .find_iter(b"tok042 ... tok007")
        .map(|hit| hit.pattern_idx)
        .collect();
    assert_eq!(indices, [42, 7]);
}
/// Every alternative of an alternation pattern reports the index of that one
/// input pattern, not a per-alternative index.
#[test]
fn set_merged_ac_alternation_patterns_map_to_one_input_index() {
    let matcher_set = StrMatcherSet::new(["foo|bar|baz", "qux"]).unwrap();

    let indices: Vec<usize> = matcher_set
        .find_iter(b"--foo-- --qux-- --baz--")
        .map(|hit| hit.pattern_idx)
        .collect();

    // foo -> pattern 0, qux -> pattern 1, baz -> pattern 0 again.
    assert_eq!(indices, [0, 1, 0]);
}
/// A set mixing unanchored and anchored patterns reports hits for all four
/// patterns on a haystack that satisfies each, and the start-anchored pattern
/// does not fire when its literal appears mid-string.
#[test]
fn set_merged_and_individual_partitions_both_yield_correct_matches() {
    let matcher_set = StrMatcherSet::new(["AKIA", "^/api/", "ghp_", "_END$"]).unwrap();

    let seen: Vec<usize> = matcher_set
        .find_iter(b"/api/foo AKIA1234 ghp_xyz _END")
        .map(|hit| hit.pattern_idx)
        .collect();
    for expected in 0..4 {
        assert!(seen.contains(&expected));
    }

    let anchored_fired = matcher_set
        .find_iter(b"prefix /api/foo")
        .any(|hit| hit.pattern_idx == 1);
    assert!(!anchored_fired, "^/api/ must not fire mid-string");
}
/// When one literal is a prefix of another and both start at the same
/// offset, the set reports a single hit for the longer literal.
#[test]
fn set_merged_ac_leftmost_longest_returns_the_longer_literal() {
    let matcher_set = StrMatcherSet::new(["AKIA", "AKIA1234"]).unwrap();
    let found: Vec<SetMatch> = matcher_set.find_iter(b"prefix AKIA1234 suffix").collect();
    assert_eq!(found.len(), 1);

    let only = &found[0];
    assert_eq!(only.pattern_idx, 1);
    assert_eq!(only.end - only.start, "AKIA1234".len());
}