use super::extract::extract_gating_substrings;
use aho_corasick::AhoCorasick;
/// A plain literal that opens with an em-dash (U+2014) is expected to yield
/// exactly one substring holding the literal's UTF-8 bytes, with the
/// case-insensitive flag cleared.
#[test]
fn em_dash_prefix_extracts_correctly() {
    let extracted = extract_gating_substrings("—password")
        .expect("expected Some for plain literal");
    assert_eq!(extracted.len(), 1, "expected exactly one substring");

    let first = &extracted[0];
    // U+2014 is the three-byte sequence E2 80 94 in UTF-8.
    let want: &[u8] = b"\xe2\x80\x94password";
    assert_eq!(
        first.0.as_bytes(),
        want,
        "extracted substring should be the original UTF-8 bytes"
    );
    assert!(!first.1, "ci flag should be false (no (?i) prefix)");
}
/// The substrings extracted from an em-dash-prefixed literal are expected to
/// build into an Aho-Corasick automaton that locates the literal inside
/// surrounding text at the right byte offset.
#[test]
fn em_dash_prefix_round_trips_through_aho_corasick() {
    let extracted = extract_gating_substrings("—password")
        .expect("expected Some for plain literal");

    // Only the text half of each pair is fed to the automaton.
    let mut needles: Vec<&str> = Vec::new();
    for (text, _) in extracted.iter() {
        needles.push(text.as_str());
    }
    let automaton = AhoCorasick::new(&needles).expect("AC build should succeed");

    let haystack = "prefix —password suffix";
    let hits: Vec<_> = automaton.find_iter(haystack).collect();
    assert!(
        !hits.is_empty(),
        "AC should find at least one match -- this is the soundness invariant the UTF-8 bug broke"
    );

    let first_hit = &hits[0];
    assert_eq!(
        first_hit.start(),
        7,
        "match should start right after 'prefix ' (7 ASCII bytes)"
    );
}
/// With a leading `(?i)`, the case-insensitive flag is expected to be set
/// while the extracted substring keeps the pattern's bytes as written,
/// including the capital `P`.
#[test]
fn case_insensitive_em_dash_prefix_extracts_correctly() {
    let extracted = extract_gating_substrings("(?i)—Password")
        .expect("expected Some for case-insensitive literal");
    assert_eq!(extracted.len(), 1);

    let (literal, case_insensitive) = &extracted[0];
    let want: &[u8] = b"\xe2\x80\x94Password";
    assert_eq!(
        literal.as_bytes(),
        want,
        "extracted substring should preserve original UTF-8 bytes including the capital P"
    );
    assert!(
        *case_insensitive,
        "ci flag should be true after stripping (?i)"
    );
}
/// A literal opening with a four-byte UTF-8 scalar (the key emoji, U+1F511)
/// is expected to be extracted byte-for-byte and then found by Aho-Corasick
/// at the expected byte offset.
#[test]
fn emoji_prefix_round_trips_through_aho_corasick() {
    let extracted = extract_gating_substrings("🔑secret")
        .expect("expected Some for plain literal");
    assert_eq!(extracted.len(), 1);

    let (literal, _) = &extracted[0];
    // U+1F511 is the four-byte sequence F0 9F 94 91 in UTF-8.
    let want: &[u8] = b"\xf0\x9f\x94\x91secret";
    assert_eq!(
        literal.as_bytes(),
        want,
        "extracted substring should preserve the original 4-byte emoji bytes"
    );

    let needles: Vec<&str> = extracted.iter().map(|pair| pair.0.as_str()).collect();
    let automaton = AhoCorasick::new(&needles).expect("AC build should succeed");
    let hits: Vec<_> = automaton.find_iter("prefix 🔑secret suffix").collect();
    assert!(
        !hits.is_empty(),
        "AC should find at least one match for the 4-byte emoji prefix"
    );

    let first_start = hits[0].start();
    assert_eq!(
        first_start,
        7,
        "match should start right after 'prefix ' (7 ASCII bytes)"
    );
}
/// A literal opening with a two-byte UTF-8 scalar (e-acute, U+00E9) is
/// expected to be extracted byte-for-byte and then found by Aho-Corasick at
/// the expected byte offset.
#[test]
fn two_byte_utf8_prefix_round_trips_through_aho_corasick() {
    let extracted = extract_gating_substrings("étudiant")
        .expect("expected Some for plain literal");
    assert_eq!(extracted.len(), 1);

    let head = &extracted[0];
    // U+00E9 is the two-byte sequence C3 A9 in UTF-8.
    let want: &[u8] = b"\xc3\xa9tudiant";
    assert_eq!(
        head.0.as_bytes(),
        want,
        "extracted substring should preserve the original 2-byte e-acute bytes"
    );

    let mut needles: Vec<&str> = Vec::new();
    for (text, _) in extracted.iter() {
        needles.push(text.as_str());
    }
    let automaton = AhoCorasick::new(&needles).expect("AC build should succeed");

    let haystack = "prefix étudiant suffix";
    let hits: Vec<_> = automaton.find_iter(haystack).collect();
    assert!(
        !hits.is_empty(),
        "AC should find at least one match for the 2-byte e-acute prefix"
    );

    let first_hit = &hits[0];
    assert_eq!(
        first_hit.start(),
        7,
        "match should start right after 'prefix ' (7 ASCII bytes)"
    );
}
/// A leading `^` anchor is expected not to appear in the result: the single
/// extracted substring is the em-dash literal that follows it.
#[test]
fn anchor_prefix_extracts_after_strip() {
    let pattern = "^—password";
    let extracted =
        extract_gating_substrings(pattern).expect("expected Some after anchor strip");
    assert_eq!(extracted.len(), 1);

    let literal_bytes = extracted[0].0.as_bytes();
    let want: &[u8] = b"\xe2\x80\x94password";
    assert_eq!(
        literal_bytes,
        want,
        "extracted substring should preserve em-dash bytes after `^` strip"
    );
}
/// Checks both sides of the minimum-prefix threshold for non-ASCII literals:
/// the three-byte em-dash in `—.*` is extracted, whereas the two-byte
/// e-acute in `é.*` yields `None`. The assertion messages attribute the
/// difference to `MIN_PREFIX_LEN` being counted in bytes.
#[test]
fn short_non_ascii_prefix_rejected_by_min_prefix_len() {
    // Three bytes: expected to clear the threshold.
    let accepted = extract_gating_substrings("—.*")
        .expect("3-byte em-dash prefix should pass MIN_PREFIX_LEN");
    assert_eq!(accepted.len(), 1);
    let (literal, _) = &accepted[0];
    assert_eq!(literal.as_bytes(), b"\xe2\x80\x94");

    // Two bytes: expected to fall below the threshold.
    let rejected = extract_gating_substrings("é.*");
    assert!(
        rejected.is_none(),
        "2-byte e-acute prefix is below MIN_PREFIX_LEN (bytes), should be None"
    );
}
/// A non-capturing alternation of two em-dash literals is expected to
/// produce one substring per branch, in branch order, and the resulting
/// automaton is expected to fire on text containing only the second branch.
#[test]
fn alternation_with_non_ascii_extracts_both_branches() {
    let extracted = extract_gating_substrings("(?:—password|—token)")
        .expect("expected Some for alternation of literals");
    assert_eq!(
        extracted.len(),
        2,
        "expected one substring per alternation branch"
    );

    let (first_branch, second_branch) = (&extracted[0].0, &extracted[1].0);
    let want_first: &[u8] = b"\xe2\x80\x94password";
    let want_second: &[u8] = b"\xe2\x80\x94token";
    assert_eq!(
        first_branch.as_bytes(),
        want_first,
        "first branch should be em-dash + password"
    );
    assert_eq!(
        second_branch.as_bytes(),
        want_second,
        "second branch should be em-dash + token"
    );

    let needles: Vec<&str> = extracted.iter().map(|pair| pair.0.as_str()).collect();
    let automaton = AhoCorasick::new(&needles).expect("AC build should succeed");
    let haystack = "here is —token";
    let hits: Vec<_> = automaton.find_iter(haystack).collect();
    assert!(
        !hits.is_empty(),
        "AC should fire on the second-branch literal"
    );
}
/// With a positive lookahead opening the pattern, the single extracted
/// substring is expected to be the literal that follows the lookahead, with
/// the case-insensitive flag cleared.
#[test]
fn positive_lookahead_at_start_extracts_after_body() {
    let pattern = "(?=foo)bar";
    let extracted =
        extract_gating_substrings(pattern).expect("expected Some after lookahead skip");
    assert_eq!(extracted.len(), 1);

    let first = &extracted[0];
    assert_eq!(first.0.as_bytes(), b"bar");
    assert!(!first.1, "ci flag should be false (no (?i) prefix)");
}
/// With a negative lookahead opening the pattern, the single extracted
/// substring is expected to be the literal that follows the lookahead.
#[test]
fn negative_lookahead_at_start_extracts_after_body() {
    let pattern = "(?!foo)bar";
    let extracted = extract_gating_substrings(pattern)
        .expect("expected Some after negative-lookahead skip");
    assert_eq!(extracted.len(), 1);

    let (literal, _) = &extracted[0];
    assert_eq!(literal.as_bytes(), b"bar");
}
/// With a positive lookbehind opening the pattern, the single extracted
/// substring is expected to be the literal that follows the lookbehind.
#[test]
fn positive_lookbehind_at_start_extracts_after_body() {
    let pattern = "(?<=foo)bar";
    let extracted = extract_gating_substrings(pattern)
        .expect("expected Some after positive-lookbehind skip");
    assert_eq!(extracted.len(), 1);

    let literal_bytes = extracted[0].0.as_bytes();
    assert_eq!(literal_bytes, b"bar");
}
/// With a negative lookbehind opening the pattern, the single extracted
/// substring is expected to be the literal that follows the lookbehind.
#[test]
fn negative_lookbehind_at_start_extracts_after_body() {
    let pattern = "(?<!foo)bar";
    let extracted = extract_gating_substrings(pattern)
        .expect("expected Some after negative-lookbehind skip");
    assert_eq!(extracted.len(), 1);

    let (literal, _) = &extracted[0];
    assert_eq!(literal.as_bytes(), b"bar");
}
/// With a lookahead closing the pattern, the single extracted substring is
/// expected to be the literal that precedes the lookahead.
#[test]
fn lookahead_at_end_extracts_before_body() {
    let pattern = "foobar(?=baz)";
    let extracted = extract_gating_substrings(pattern)
        .expect("expected Some with literal-then-lookahead");
    assert_eq!(extracted.len(), 1);

    let literal_bytes = extracted[0].0.as_bytes();
    assert_eq!(literal_bytes, b"foobar");
}
/// With literals on both sides of a lookahead and the leading one longer,
/// the single extracted substring is expected to be the leading literal.
#[test]
fn lookahead_in_middle_extracts_best_literal() {
    let pattern = "foofoo(?=x)bar";
    let extracted = extract_gating_substrings(pattern)
        .expect("expected Some with literal-lookahead-literal");
    assert_eq!(extracted.len(), 1);

    let (literal, _) = &extracted[0];
    assert_eq!(
        literal.as_bytes(),
        b"foofoo",
        "extract_branch should pick the longest of the two literals"
    );
}
/// With literals on both sides of a lookahead and the trailing one longer,
/// the single extracted substring is expected to be the trailing literal.
#[test]
fn lookahead_in_middle_picks_longer_after_skip() {
    let pattern = "foo(?=x)barbaz";
    let extracted =
        extract_gating_substrings(pattern).expect("expected Some after lookahead skip");
    assert_eq!(extracted.len(), 1);

    let literal_bytes = extracted[0].0.as_bytes();
    assert_eq!(
        literal_bytes,
        b"barbaz",
        "post-fix walker should continue past lookahead and pick the longer trailing literal"
    );
}
/// For a literal sandwiched between a lookbehind and a lookahead, the single
/// extracted substring is expected to be that middle literal (` -- `).
#[test]
fn prose_em_dash_pattern_extracts_middle_literal() {
    let pattern = "(?<=[a-z]) -- (?=[a-z])";
    let extracted = extract_gating_substrings(pattern)
        .expect("expected Some after lookbehind+lookahead skip");
    assert_eq!(extracted.len(), 1);

    let (literal, _) = &extracted[0];
    assert_eq!(
        literal.as_bytes(),
        b" -- ",
        "literal between the two zero-width lookarounds should be the AC gate"
    );
}
/// When the opening lookahead itself contains a group, the single extracted
/// substring is expected to be the literal after the outer lookahead.
#[test]
fn nested_lookaround_extracts_after_outer() {
    let pattern = "(?=(?:foo|bar))baz";
    let extracted = extract_gating_substrings(pattern)
        .expect("expected Some after nested-lookaround skip");
    assert_eq!(extracted.len(), 1);

    let literal_bytes = extracted[0].0.as_bytes();
    assert_eq!(literal_bytes, b"baz");
}
/// Regression guard for named capture groups: `(?<name>foo)bar` must still
/// produce a gating substring.
///
/// NOTE(review): presumably this exists because `(?<name>` shares its `(?<`
/// opening with the lookbehind forms `(?<=` / `(?<!` exercised by the other
/// tests in this file -- confirm against the extractor.
///
/// Either `foo` (the capture body alone) or `foobar` (body joined with the
/// trailing literal) is accepted as the first substring.
#[test]
fn lookahead_does_not_break_named_capture_path() {
    let subs = extract_gating_substrings("(?<name>foo)bar")
        .expect("named-capture rule should still gate");
    // Guard the index below so an empty result fails with a descriptive
    // message rather than an opaque index-out-of-bounds panic.
    assert!(
        !subs.is_empty(),
        "named-capture rule should yield at least one gating substring"
    );
    let extracted_bytes = subs[0].0.as_bytes();
    assert!(
        extracted_bytes == b"foo" || extracted_bytes == b"foobar",
        "named-capture body should still gate; got {:?}",
        subs[0].0
    );
}
/// The substrings extracted from the lookbehind / ` -- ` / lookahead pattern
/// are expected to build into an Aho-Corasick automaton that finds ` -- ` in
/// ordinary prose at the right byte offset.
#[test]
fn prose_em_dash_pattern_round_trips_through_aho_corasick() {
    let extracted = extract_gating_substrings("(?<=[a-z]) -- (?=[a-z])")
        .expect("expected Some after both lookaround skips");

    let mut needles: Vec<&str> = Vec::new();
    for (text, _) in extracted.iter() {
        needles.push(text.as_str());
    }
    let automaton = AhoCorasick::new(&needles).expect("AC build should succeed");

    let haystack = "hello -- world";
    let hits: Vec<_> = automaton.find_iter(haystack).collect();
    assert!(
        !hits.is_empty(),
        "AC should fire on ` -- ` for prose em-dash content"
    );

    let first_start = hits[0].start();
    assert_eq!(
        first_start,
        5,
        "match should start at byte offset 5 (after `hello`)"
    );
}