use super::extract::extract_gating_substrings;
use aho_corasick::AhoCorasick;
/// A plain literal that begins with an em-dash must yield exactly one gating
/// substring, carrying the original UTF-8 bytes and a cleared
/// case-insensitive flag.
#[test]
fn em_dash_prefix_extracts_correctly() {
    let extracted =
        extract_gating_substrings("—password").expect("expected Some for plain literal");
    assert_eq!(extracted.len(), 1, "expected exactly one substring");

    let literal = &extracted[0].0;
    let case_insensitive = extracted[0].1;
    // U+2014 (em-dash) encodes as the three bytes E2 80 94.
    assert_eq!(
        literal.as_bytes(),
        b"\xe2\x80\x94password",
        "extracted substring should be the original UTF-8 bytes"
    );
    assert!(
        !case_insensitive,
        "ci flag should be false (no (?i) prefix)"
    );
}
/// Feeds the substrings extracted for an em-dash-prefixed literal into an
/// Aho-Corasick automaton and checks that the automaton finds the literal
/// inside surrounding text, starting at byte offset 7.
#[test]
fn em_dash_prefix_round_trips_through_aho_corasick() {
    let extracted =
        extract_gating_substrings("—password").expect("expected Some for plain literal");
    let needles: Vec<&str> = extracted.iter().map(|pair| pair.0.as_str()).collect();
    let automaton = AhoCorasick::new(&needles).expect("AC build should succeed");

    let haystack = "prefix —password suffix";
    let hits: Vec<_> = automaton.find_iter(haystack).collect();
    assert!(
        !hits.is_empty(),
        "AC should find at least one match -- this is the soundness invariant the UTF-8 bug broke"
    );

    // "prefix " is seven single-byte characters, so the first hit begins at 7.
    let first_hit = &hits[0];
    assert_eq!(
        first_hit.start(),
        7,
        "match should start right after 'prefix ' (7 ASCII bytes)"
    );
}
/// With a leading `(?i)`, the em-dash-prefixed literal must come back with
/// its bytes unchanged (capital `P` included) and the case-insensitive flag
/// set.
#[test]
fn case_insensitive_em_dash_prefix_extracts_correctly() {
    let extracted = extract_gating_substrings("(?i)—Password")
        .expect("expected Some for case-insensitive literal");
    assert_eq!(extracted.len(), 1);

    let (literal, case_insensitive) = (&extracted[0].0, extracted[0].1);
    assert_eq!(
        literal.as_bytes(),
        b"\xe2\x80\x94Password",
        "extracted substring should preserve original UTF-8 bytes including the capital P"
    );
    assert!(
        case_insensitive,
        "ci flag should be true after stripping (?i)"
    );
}
/// Covers a literal prefixed with a 4-byte emoji: the single extracted
/// substring must keep the emoji's UTF-8 bytes, and an Aho-Corasick automaton
/// built from it must match inside surrounding text at byte offset 7.
#[test]
fn emoji_prefix_round_trips_through_aho_corasick() {
    let extracted =
        extract_gating_substrings("🔑secret").expect("expected Some for plain literal");
    assert_eq!(extracted.len(), 1);

    let literal = &extracted[0].0;
    assert_eq!(
        literal.as_bytes(),
        b"\xf0\x9f\x94\x91secret",
        "extracted substring should preserve the original 4-byte emoji bytes"
    );

    let needles: Vec<&str> = extracted.iter().map(|(text, _)| text.as_str()).collect();
    let automaton = AhoCorasick::new(&needles).expect("AC build should succeed");
    let haystack = "prefix 🔑secret suffix";
    let hits: Vec<_> = automaton.find_iter(haystack).collect();
    assert!(
        !hits.is_empty(),
        "AC should find at least one match for the 4-byte emoji prefix"
    );
    assert_eq!(
        hits[0].start(),
        7,
        "match should start right after 'prefix ' (7 ASCII bytes)"
    );
}
/// Covers a literal that starts with a 2-byte UTF-8 character (e-acute): the
/// single extracted substring must keep those bytes, and an Aho-Corasick
/// automaton built from it must match inside surrounding text at byte
/// offset 7.
#[test]
fn two_byte_utf8_prefix_round_trips_through_aho_corasick() {
    let extracted =
        extract_gating_substrings("étudiant").expect("expected Some for plain literal");
    assert_eq!(extracted.len(), 1);

    let literal = &extracted[0].0;
    assert_eq!(
        literal.as_bytes(),
        b"\xc3\xa9tudiant",
        "extracted substring should preserve the original 2-byte e-acute bytes"
    );

    let needles: Vec<&str> = extracted.iter().map(|entry| entry.0.as_str()).collect();
    let automaton = AhoCorasick::new(&needles).expect("AC build should succeed");
    let haystack = "prefix étudiant suffix";
    let hits: Vec<_> = automaton.find_iter(haystack).collect();
    assert!(
        !hits.is_empty(),
        "AC should find at least one match for the 2-byte e-acute prefix"
    );
    assert_eq!(
        hits[0].start(),
        7,
        "match should start right after 'prefix ' (7 ASCII bytes)"
    );
}
/// A leading `^` anchor must not prevent extraction: the one substring
/// returned is the em-dash literal that follows the anchor, bytes intact.
#[test]
fn anchor_prefix_extracts_after_strip() {
    let extracted =
        extract_gating_substrings("^—password").expect("expected Some after anchor strip");
    assert_eq!(extracted.len(), 1);

    let literal = &extracted[0].0;
    assert_eq!(
        literal.as_bytes(),
        b"\xe2\x80\x94password",
        "extracted substring should preserve em-dash bytes after `^` strip"
    );
}
/// Probes both sides of the minimum-prefix threshold with non-ASCII input.
///
/// Despite the name, the test covers two cases: a 3-byte em-dash prefix
/// (`—.*`) is accepted and returned verbatim, while a 2-byte e-acute prefix
/// (`é.*`) yields `None`. The assertion messages attribute the cutoff to
/// `MIN_PREFIX_LEN`, measured in bytes.
#[test]
fn short_non_ascii_prefix_rejected_by_min_prefix_len() {
    // Accepted side: three bytes of prefix.
    let accepted = extract_gating_substrings("—.*")
        .expect("3-byte em-dash prefix should pass MIN_PREFIX_LEN");
    assert_eq!(accepted.len(), 1);
    let literal = &accepted[0].0;
    assert_eq!(literal.as_bytes(), b"\xe2\x80\x94");

    // Rejected side: two bytes of prefix.
    let rejected = extract_gating_substrings("é.*");
    assert!(
        rejected.is_none(),
        "2-byte e-acute prefix is below MIN_PREFIX_LEN (bytes), should be None"
    );
}
/// An alternation of two em-dash-prefixed literals must produce one gating
/// substring per branch, in branch order, and an Aho-Corasick automaton built
/// from them must fire on text containing only the second branch.
#[test]
fn alternation_with_non_ascii_extracts_both_branches() {
    let branches = extract_gating_substrings("(?:—password|—token)")
        .expect("expected Some for alternation of literals");
    assert_eq!(
        branches.len(),
        2,
        "expected one substring per alternation branch"
    );

    let first_branch = &branches[0].0;
    assert_eq!(
        first_branch.as_bytes(),
        b"\xe2\x80\x94password",
        "first branch should be em-dash + password"
    );
    let second_branch = &branches[1].0;
    assert_eq!(
        second_branch.as_bytes(),
        b"\xe2\x80\x94token",
        "second branch should be em-dash + token"
    );

    // The automaton is built straight from the borrowed substrings.
    let automaton = AhoCorasick::new(branches.iter().map(|(text, _)| text.as_str()))
        .expect("AC build should succeed");
    let hits: Vec<_> = automaton.find_iter("here is —token").collect();
    assert!(
        !hits.is_empty(),
        "AC should fire on the second-branch literal"
    );
}
/// A leading positive lookahead `(?=foo)` is skipped: the only substring
/// returned is the trailing literal `bar`, with the case-insensitive flag
/// unset.
#[test]
fn positive_lookahead_at_start_extracts_after_body() {
    let extracted =
        extract_gating_substrings("(?=foo)bar").expect("expected Some after lookahead skip");
    assert_eq!(extracted.len(), 1);

    let literal = &extracted[0].0;
    let case_insensitive = extracted[0].1;
    assert_eq!(literal.as_bytes(), b"bar");
    assert!(
        !case_insensitive,
        "ci flag should be false (no (?i) prefix)"
    );
}
/// A leading negative lookahead `(?!foo)` is skipped: the only substring
/// returned is the trailing literal `bar`.
#[test]
fn negative_lookahead_at_start_extracts_after_body() {
    let extracted = extract_gating_substrings("(?!foo)bar")
        .expect("expected Some after negative-lookahead skip");
    assert_eq!(extracted.len(), 1);

    let (literal, _) = &extracted[0];
    assert_eq!(literal.as_bytes(), b"bar");
}
/// A leading positive lookbehind `(?<=foo)` is skipped: the only substring
/// returned is the trailing literal `bar`.
#[test]
fn positive_lookbehind_at_start_extracts_after_body() {
    let extracted = extract_gating_substrings("(?<=foo)bar")
        .expect("expected Some after positive-lookbehind skip");
    assert_eq!(extracted.len(), 1);

    let (literal, _) = &extracted[0];
    assert_eq!(literal.as_bytes(), b"bar");
}
/// A leading negative lookbehind `(?<!foo)` is skipped: the only substring
/// returned is the trailing literal `bar`.
#[test]
fn negative_lookbehind_at_start_extracts_after_body() {
    let extracted = extract_gating_substrings("(?<!foo)bar")
        .expect("expected Some after negative-lookbehind skip");
    assert_eq!(extracted.len(), 1);

    let (literal, _) = &extracted[0];
    assert_eq!(literal.as_bytes(), b"bar");
}
/// When the lookahead trails the pattern, the literal in front of it
/// (`foobar`) is the single substring returned.
#[test]
fn lookahead_at_end_extracts_before_body() {
    let extracted = extract_gating_substrings("foobar(?=baz)")
        .expect("expected Some with literal-then-lookahead");
    assert_eq!(extracted.len(), 1);

    let (literal, _) = &extracted[0];
    assert_eq!(literal.as_bytes(), b"foobar");
}
/// With literals on both sides of a lookahead, the longer leading literal
/// (`foofoo`) must be chosen over the shorter trailing one (`bar`).
#[test]
fn lookahead_in_middle_extracts_best_literal() {
    let extracted = extract_gating_substrings("foofoo(?=x)bar")
        .expect("expected Some with literal-lookahead-literal");
    assert_eq!(extracted.len(), 1);

    let (literal, _) = &extracted[0];
    assert_eq!(
        literal.as_bytes(),
        b"foofoo",
        "extract_branch should pick the longest of the two literals"
    );
}
/// Mirror of the previous case: the longer literal sits after the lookahead,
/// so `barbaz` must be chosen over the shorter leading `foo`.
#[test]
fn lookahead_in_middle_picks_longer_after_skip() {
    let extracted = extract_gating_substrings("foo(?=x)barbaz")
        .expect("expected Some after lookahead skip");
    assert_eq!(extracted.len(), 1);

    let (literal, _) = &extracted[0];
    assert_eq!(
        literal.as_bytes(),
        b"barbaz",
        "post-fix walker should continue past lookahead and pick the longer trailing literal"
    );
}
/// A literal sandwiched between a lookbehind and a lookahead (` -- `) must be
/// the single substring returned.
#[test]
fn prose_em_dash_pattern_extracts_middle_literal() {
    let extracted = extract_gating_substrings("(?<=[a-z]) -- (?=[a-z])")
        .expect("expected Some after lookbehind+lookahead skip");
    assert_eq!(extracted.len(), 1);

    let (literal, _) = &extracted[0];
    assert_eq!(
        literal.as_bytes(),
        b" -- ",
        "literal between the two zero-width lookarounds should be the AC gate"
    );
}
/// A lookahead whose body contains a nested non-capturing group is skipped
/// as a whole: only the trailing literal `baz` is returned.
#[test]
fn nested_lookaround_extracts_after_outer() {
    let extracted = extract_gating_substrings("(?=(?:foo|bar))baz")
        .expect("expected Some after nested-lookaround skip");
    assert_eq!(extracted.len(), 1);

    let (literal, _) = &extracted[0];
    assert_eq!(literal.as_bytes(), b"baz");
}
/// Guards the named-capture path: `(?<name>foo)bar` must still yield a gating
/// substring. Either the capture body (`foo`) or the joined literal
/// (`foobar`) is accepted as the first entry.
///
/// NOTE(review): presumably this exists because `(?<name>` shares the `(?<`
/// opener with lookbehinds -- confirm against the extractor.
#[test]
fn lookahead_does_not_break_named_capture_path() {
    let subs = extract_gating_substrings("(?<name>foo)bar")
        .expect("named-capture rule should still gate");
    // Check emptiness explicitly so a `Some` holding no entries fails with a
    // descriptive message rather than an index-out-of-bounds panic below.
    assert!(
        !subs.is_empty(),
        "named-capture rule should yield at least one gating substring"
    );
    let extracted_bytes = subs[0].0.as_bytes();
    assert!(
        extracted_bytes == b"foo" || extracted_bytes == b"foobar",
        "named-capture body should still gate; got {:?}",
        subs[0].0
    );
}
/// Builds an Aho-Corasick automaton from the substrings extracted for the
/// lookbehind/literal/lookahead pattern and checks that it matches ` -- ` in
/// ordinary prose at byte offset 5.
#[test]
fn prose_em_dash_pattern_round_trips_through_aho_corasick() {
    let extracted = extract_gating_substrings("(?<=[a-z]) -- (?=[a-z])")
        .expect("expected Some after both lookaround skips");
    // The automaton is built straight from the borrowed substrings.
    let automaton = AhoCorasick::new(extracted.iter().map(|(text, _)| text.as_str()))
        .expect("AC build should succeed");

    let haystack = "hello -- world";
    let hits: Vec<_> = automaton.find_iter(haystack).collect();
    assert!(
        !hits.is_empty(),
        "AC should fire on ` -- ` for prose em-dash content"
    );
    assert_eq!(
        hits[0].start(),
        5,
        "match should start at byte offset 5 (after `hello`)"
    );
}
/// An inline `(?i)` in the middle of a pattern must mark the literal that
/// follows it as case-insensitive. The longer trailing literal is the one
/// expected back, with its flag set.
#[test]
fn inline_flag_propagates_ci_to_subsequent_literal() {
    let extracted = extract_gating_substrings("literalA(?i)keyword-suffix")
        .expect("expected Some for literal + inline-flag + literal pattern");
    assert_eq!(
        extracted.len(),
        1,
        "walker should pick a single best literal"
    );

    let literal = &extracted[0].0;
    let case_insensitive = extracted[0].1;
    assert_eq!(
        literal.as_bytes(),
        b"keyword-suffix",
        "longer literal `keyword-suffix` (14 bytes) wins over `literalA` (8 bytes)"
    );
    assert!(
        case_insensitive,
        "BUG 1: ci must be true after the inline (?i) flag; pre-fix this was false"
    );
}
/// Any leading flag group containing `u` must make extraction return `None`,
/// while a plain `(?i)` still extracts its literal with the case-insensitive
/// flag set.
#[test]
fn unicode_flag_disables_extraction() {
    // Each entry pairs a pattern that must be rejected with its failure text.
    let must_reject = [
        (
            "(?iu)cafésecret",
            "BUG 2: (?iu) leading flag must disable extraction",
        ),
        (
            "(?ui)cafésecret",
            "BUG 2: (?ui) leading flag must disable extraction",
        ),
        (
            "(?u)cafésecret",
            "BUG 2: (?u) leading flag must disable extraction (conservative)",
        ),
    ];
    for &(pattern, reason) in &must_reject {
        assert!(extract_gating_substrings(pattern).is_none(), "{}", reason);
    }

    // Control case: no `u` flag, so extraction proceeds.
    let extracted = extract_gating_substrings("(?i)keyword-suffix")
        .expect("plain (?i) without u flag must still extract");
    assert_eq!(extracted.len(), 1);
    let (literal, case_insensitive) = (&extracted[0].0, extracted[0].1);
    assert_eq!(literal.as_bytes(), b"keyword-suffix");
    assert!(case_insensitive, "ci should be true for plain (?i)");
}
/// An inline `(?-i)` must cancel an earlier `(?i)`: the literal after the
/// negated flag is expected back with the case-insensitive flag cleared.
#[test]
fn inline_negated_flag_clears_ci_for_subsequent_literal() {
    let extracted = extract_gating_substrings("(?i)shorty(?-i)keyword-suffix")
        .expect("expected Some for outer (?i) + inline (?-i) + literal");
    assert_eq!(extracted.len(), 1);

    let literal = &extracted[0].0;
    let case_insensitive = extracted[0].1;
    assert_eq!(literal.as_bytes(), b"keyword-suffix");
    assert!(
        !case_insensitive,
        "BUG 1 (symmetric): inline (?-i) must clear the outer (?i) for subsequent literals"
    );
}
/// The body of a scoped extended-mode group `(?x:...)` must never be used as
/// a gating substring. On its own the group yields `None`; placed after an
/// escaped-underscore literal, only that outer literal is returned.
#[test]
fn scoped_extended_flag_disables_body_extraction() {
    let group_only = extract_gating_substrings("(?x:foo bar)");
    assert!(
        group_only.is_none(),
        "BUG 9: scoped (?x:body) must not extract any substring"
    );

    let with_outer_literal = extract_gating_substrings(r"required\_(?x:foo bar)")
        .expect("outer literal must still extract even with (?x:body) after");
    assert_eq!(
        with_outer_literal.len(),
        1,
        "expected exactly one substring (outer literal)"
    );
    let (literal, _) = &with_outer_literal[0];
    assert_eq!(literal.as_bytes(), b"required_");
}
/// A bare `_` must never appear inside a gating substring (the assertion
/// message calls it the resharp wildcard), whereas an escaped `\_` is kept
/// as a literal underscore in the single extracted substring.
#[test]
fn bare_underscore_wildcard_does_not_appear_in_gate() {
    // Bare underscore: whatever is extracted must not contain `_`.
    let bare = extract_gating_substrings("pre_post")
        .expect("expected Some -- some literal side of the wildcard must extract");
    for entry in bare.iter() {
        let sub = &entry.0;
        assert!(
            !sub.contains('_'),
            "BUG 10: gating substring {:?} must not contain bare `_` (resharp wildcard)",
            sub
        );
    }

    // Escaped underscore: the whole literal survives, underscore included.
    let escaped = extract_gating_substrings(r"pre\_post")
        .expect("expected Some for escaped-underscore literal");
    assert_eq!(
        escaped.len(),
        1,
        "expected one substring (the full literal)"
    );
    let (literal, _) = &escaped[0];
    assert_eq!(
        literal.as_bytes(),
        b"pre_post",
        "BUG 10 regression: escaped \\_ must keep the underscore as literal"
    );
}