hyperi_rustlib/strmatch/mod.rs
1// Project: hyperi-rustlib
2// File: src/strmatch/mod.rs
3// Purpose: Public API for the strmatch regex→fast-path matcher
4// Language: Rust
5//
6// License: BUSL-1.1
7// Copyright: (c) 2026 HYPERI PTY LIMITED
8
9//! Regex-shaped patterns, fast-path dispatch.
10//!
11//! Operators write a regex (the most familiar pattern language).
12//! `strmatch` classifies it into one of four tiers and dispatches at
13//! match time via the cheapest engine that's correct:
14//!
15//! - **Byte** (≤ 30 ns) -- direct byte ops: `memchr` / `memchr2` /
16//! `memchr3` / single-byte `starts_with` / `ends_with` / `==`.
17//! - **Literal** (≤ 200 ns) -- single multi-byte literal:
18//! `memmem::Finder` / multi-byte `starts_with` / `ends_with` / `==`.
19//! - **LiteralSet** (≤ 500 ns) -- `aho-corasick` over ≥ 2 literals
20//! (optional uniform anchor checked after the AC scan). Regex engine
21//! is never invoked.
22//! - **Regex** (engine-bounded) -- fall through to
23//! `regex-automata::meta::Regex`. Has its own internal prefilter
24//! pipeline; cost depends on pattern and haystack.
25//!
26//! Budgets are typical for a modern x86 server on a ~200-byte
27//! haystack; see [`MatcherTier::typical_budget_ns`] and
28//! `benches/strmatch.rs`.
29//!
30//! ## Anti-spam discipline
31//!
32//! When a pattern compiles to [`MatcherTier::Regex`] (the engine
33//! fall-back), `strmatch` emits **one** WARN per distinct pattern per
34//! process, capped at 10 distinct WARNs total. After the cap, further
35//! fall-through patterns log at DEBUG. A counter
36//! `hyperi_strmatch_regex_fallback_total` increments on every
37//! fall-through regardless of log level -- operators can scrape that
38//! rather than rely on logs.
39//!
40//! ## Quality gates
41//!
42//! Use [`StrMatcher::builder`] with [`StrMatcherBuilder::min_tier`] to
43//! reject (or loudly warn about) patterns that fall below an
44//! operator-chosen tier. Useful for hot-path configs where regex
45//! fall-through is unacceptable.
46//!
47//! ## Example
48//!
49//! ```
50//! use hyperi_rustlib::strmatch::{MatcherTier, OnBelowMin, StrMatcher};
51//!
52//! // Byte tier -- anchored single byte, dispatches to hay.first() == Some(b)
53//! let m = StrMatcher::new(r"^/")?;
54//! assert_eq!(m.tier(), MatcherTier::Byte);
55//! assert!(m.is_match(b"/api/v1/orders"));
56//!
57//! // Literal tier -- multi-byte literal, dispatches to memmem
58//! let m = StrMatcher::new(r"AKIA")?;
59//! assert_eq!(m.tier(), MatcherTier::Literal);
60//! assert!(m.is_match(b"... AKIA1234 ..."));
61//!
62//! // LiteralSet tier -- alternation, dispatches to AhoCorasick
63//! let m = StrMatcher::new(r"AKIA|ghp_|sk_live_")?;
64//! assert_eq!(m.tier(), MatcherTier::LiteralSet);
65//! assert!(m.is_match(b"github token: ghp_abcdef"));
66//!
67//! // Regex tier -- falls through to engine; refuse the build instead
68//! let err = StrMatcher::builder()
69//! .min_tier(MatcherTier::LiteralSet)
70//! .on_below_min(OnBelowMin::Reject)
71//! .build(r"\w+@\w+")
72//! .unwrap_err();
73//! assert!(err.to_string().contains("tier"));
74//! # Ok::<(), hyperi_rustlib::strmatch::BuildError>(())
75//! ```
76
77mod classify;
78mod plan;
79
80use std::collections::HashSet;
81use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
82use std::sync::{LazyLock, Mutex};
83
84use thiserror::Error;
85
86use classify::Classified;
87use plan::Plan;
88
89// ---------------------------------------------------------------------------
90// Public types
91// ---------------------------------------------------------------------------
92
93/// Which engine class a [`StrMatcher`] is dispatching to. Tiers are
94/// ordered by cost -- `Byte > Literal > LiteralSet > Regex` (higher
95/// means faster). Use [`Self::rank`] for `min_tier` comparisons.
96///
97/// Budgets below are typical for a modern x86 server on a ~200-byte
98/// haystack. Hardware-dependent; cold caches and pathological inputs
99/// move them around.
100#[non_exhaustive]
101#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
102pub enum MatcherTier {
103 /// **≤ 30 ns** -- direct byte ops: `memchr` / `memchr2/3` / single-byte
104 /// `starts_with` / `ends_with` / `==`.
105 Byte,
106 /// **≤ 200 ns** -- single multi-byte literal: `memmem::Finder` /
107 /// multi-byte `starts_with` / `ends_with` / `==`.
108 Literal,
109 /// **≤ 500 ns** -- `aho-corasick` over ≥ 2 literals (optional
110 /// uniform anchor checked after the AC scan).
111 LiteralSet,
112 /// **Bounded by the regex engine** -- `regex-automata::meta::Regex`
113 /// fall-through. Cost depends on pattern and haystack.
114 Regex,
115}
116
117impl std::fmt::Display for MatcherTier {
118 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
119 match self {
120 Self::Byte => f.write_str("Byte"),
121 Self::Literal => f.write_str("Literal"),
122 Self::LiteralSet => f.write_str("LiteralSet"),
123 Self::Regex => f.write_str("Regex"),
124 }
125 }
126}
127
128impl MatcherTier {
129 /// Cost rank -- higher means faster. Useful for `min_tier`
130 /// comparisons without depending on the specific ns numbers.
131 ///
132 /// Ordering: `Byte (4) > Literal (3) > LiteralSet (2) > Regex (1)`.
133 #[must_use]
134 pub const fn rank(self) -> u8 {
135 match self {
136 Self::Byte => 4,
137 Self::Literal => 3,
138 Self::LiteralSet => 2,
139 Self::Regex => 1,
140 }
141 }
142
143 /// Indicative ns budget per `is_match` call on a ~200-byte
144 /// haystack (modern x86 server). `None` for [`Self::Regex`] since
145 /// the engine is unbounded.
146 ///
147 /// Numbers are estimates from benchmarks in `benches/strmatch.rs`;
148 /// real-world cost varies with haystack length, cache state, and
149 /// pattern complexity.
150 #[must_use]
151 pub const fn typical_budget_ns(self) -> Option<u64> {
152 match self {
153 Self::Byte => Some(30),
154 Self::Literal => Some(200),
155 Self::LiteralSet => Some(500),
156 Self::Regex => None,
157 }
158 }
159}
160
161/// Byte offsets of a match. End is exclusive: `&hay[start..end]` is the
162/// matched slice.
163#[derive(Debug, Clone, Copy, PartialEq, Eq)]
164#[non_exhaustive]
165pub struct Match {
166 pub start: usize,
167 pub end: usize,
168}
169
170/// Like [`Match`] but also identifies which input pattern matched in a
171/// [`StrMatcherSet`].
172#[derive(Debug, Clone, Copy, PartialEq, Eq)]
173#[non_exhaustive]
174pub struct SetMatch {
175 pub start: usize,
176 pub end: usize,
177 pub pattern_idx: usize,
178}
179
180/// Failure modes during construction.
181#[derive(Debug, Error)]
182#[non_exhaustive]
183pub enum BuildError {
184 /// The pattern is empty.
185 #[error(
186 "strmatch: empty pattern\n \
187 hint: an empty pattern matches every position; pass a non-empty \
188 literal or wrap the matcher in Option for an absent-matcher slot"
189 )]
190 Empty,
191
192 /// The regex parser rejected the pattern.
193 #[error(
194 "strmatch: regex syntax error in pattern {pattern:?}\n \
195 reason: {source}\n \
196 hint: {hint}"
197 )]
198 Syntax {
199 pattern: String,
200 #[source]
201 source: Box<regex_syntax::Error>,
202 hint: &'static str,
203 },
204
205 /// The pattern compiled to a tier below the builder's minimum.
206 #[error(
207 "strmatch: pattern {pattern:?} compiles to tier {got}, but builder \
208 requires at least {wanted}\n \
209 reason: {reason}\n \
210 hint: {hint}"
211 )]
212 TierTooLow {
213 pattern: String,
214 wanted: MatcherTier,
215 got: MatcherTier,
216 reason: &'static str,
217 hint: &'static str,
218 },
219}
220
221/// What to do when a pattern's classification falls below the
222/// builder's `min_tier`.
223#[non_exhaustive]
224#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
225pub enum OnBelowMin {
226 /// Default -- proceed quietly. The anti-spam protocol still emits
227 /// up to ten WARNs per process for fall-through patterns.
228 #[default]
229 Allow,
230 /// Proceed, but force an immediate WARN that bypasses the anti-spam
231 /// cap. Use when the caller explicitly wants every fall-through
232 /// logged regardless of process-wide dedup state.
233 Warn,
234 /// Refuse to build. Returns [`BuildError::TierTooLow`].
235 Reject,
236}
237
238// ---------------------------------------------------------------------------
239// StrMatcher
240// ---------------------------------------------------------------------------
241
242/// Compiled pattern with tier-aware dispatch.
243pub struct StrMatcher {
244 plan: Plan,
245 tier: MatcherTier,
246 pattern: String,
247 /// Short machine-readable reason (e.g. `"shape:starts-with"`,
248 /// `"literal-only:alternation"`, `"unbounded-quantifier"`).
249 reason: &'static str,
250}
251
252impl std::fmt::Debug for StrMatcher {
253 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
254 f.debug_struct("StrMatcher")
255 .field("tier", &self.tier)
256 .field("pattern", &self.pattern)
257 .field("reason", &self.reason)
258 .finish_non_exhaustive()
259 }
260}
261
262impl StrMatcher {
263 /// Compile `pattern` with default options (no minimum tier, no
264 /// case-insensitivity).
265 ///
266 /// # Errors
267 ///
268 /// Returns `Err` if the pattern is empty or fails to parse. See
269 /// [`BuildError`] for the structured variants.
270 pub fn new(pattern: &str) -> Result<Self, BuildError> {
271 Self::builder().build(pattern)
272 }
273
274 /// Start a builder.
275 #[must_use]
276 pub fn builder() -> StrMatcherBuilder {
277 StrMatcherBuilder::new()
278 }
279
280 /// `true` if the haystack contains a match.
281 #[inline]
282 #[must_use]
283 pub fn is_match(&self, hay: &[u8]) -> bool {
284 self.plan.is_match(hay)
285 }
286
287 /// Find the first match, returning byte offsets.
288 #[inline]
289 #[must_use]
290 pub fn find(&self, hay: &[u8]) -> Option<Match> {
291 self.plan.find(hay)
292 }
293
294 /// Collect every non-overlapping match in iteration order.
295 ///
296 /// Eager: one allocation per call (lifetime/vtable reasons). Amortises
297 /// well over scrubber workloads (0-3 matches per haystack). Anchored
298 /// shapes (`StartsWith`/`EndsWith`/`ExactMatch`) yield at most one
299 /// match.
300 #[must_use]
301 pub fn find_iter(&self, hay: &[u8]) -> std::vec::IntoIter<Match> {
302 let mut out = Vec::new();
303 self.plan.collect_matches(hay, &mut out);
304 out.into_iter()
305 }
306
307 /// The tier this matcher dispatches to. Useful for telemetry and
308 /// assertions in tests.
309 #[must_use]
310 pub fn tier(&self) -> MatcherTier {
311 self.tier
312 }
313
314 /// The original pattern string passed to [`Self::new`] (or the
315 /// builder).
316 #[must_use]
317 pub fn pattern(&self) -> &str {
318 &self.pattern
319 }
320
321 /// Short machine-readable reason for the tier choice. For the
322 /// fast-path tiers this is a "success" reason (e.g.
323 /// `"shape:starts-with"`). For Regex fall-through it names the
324 /// disqualifying feature (`"unbounded-quantifier"`,
325 /// `"word-boundary"`, ...).
326 #[must_use]
327 pub fn reason(&self) -> &'static str {
328 self.reason
329 }
330}
331
332// ---------------------------------------------------------------------------
333// StrMatcherBuilder
334// ---------------------------------------------------------------------------
335
336/// Builder for [`StrMatcher`]. Carries minimum-tier policy and the
337/// case-insensitivity flag.
338#[derive(Debug, Clone, Default)]
339pub struct StrMatcherBuilder {
340 min_tier: Option<MatcherTier>,
341 on_below_min: OnBelowMin,
342 ascii_case_insensitive: bool,
343}
344
345impl StrMatcherBuilder {
346 /// Create a builder with default options (no minimum tier, no
347 /// case-insensitivity).
348 #[must_use]
349 pub fn new() -> Self {
350 Self::default()
351 }
352
353 /// Reject (or warn about) patterns that classify below this tier.
354 ///
355 /// Tier ordering (highest -> lowest): `Byte > Literal > LiteralSet >
356 /// Regex`. Setting `min_tier(LiteralSet)` allows Byte/Literal/
357 /// LiteralSet but triggers `on_below_min` for Regex.
358 #[must_use]
359 pub fn min_tier(mut self, tier: MatcherTier) -> Self {
360 self.min_tier = Some(tier);
361 self
362 }
363
364 /// What to do when a pattern falls below `min_tier`.
365 #[must_use]
366 pub fn on_below_min(mut self, policy: OnBelowMin) -> Self {
367 self.on_below_min = policy;
368 self
369 }
370
371 /// Build a case-insensitive matcher.
372 ///
373 /// The pattern is wrapped in `(?i:...)` before parsing. Both the
374 /// literal and shape tiers honour the flag via
375 /// `aho_corasick::AhoCorasickBuilder::ascii_case_insensitive`;
376 /// shape-tier dispatch downgrades to the literal tier when this
377 /// flag is set, since `memchr` does not fold case.
378 #[must_use]
379 pub fn ascii_case_insensitive(mut self, enabled: bool) -> Self {
380 self.ascii_case_insensitive = enabled;
381 self
382 }
383
384 /// Build the matcher.
385 ///
386 /// # Errors
387 ///
388 /// Returns `Err` if the pattern fails to parse or violates the
389 /// `min_tier` policy with `on_below_min = Reject`.
390 pub fn build(&self, pattern: &str) -> Result<StrMatcher, BuildError> {
391 let classified = classify::classify(pattern, self.ascii_case_insensitive)?;
392 let tier = classified.descriptor.tier;
393
394 if let Some(min) = self.min_tier
395 && tier.rank() < min.rank()
396 {
397 return self.apply_below_min(pattern, min, classified);
398 }
399
400 if tier == MatcherTier::Regex {
401 warn::on_regex_fallback(
402 pattern,
403 classified.descriptor.reason,
404 classified.descriptor.hint,
405 /*force=*/ false,
406 );
407 metrics_inc_fallback();
408 }
409
410 Ok(StrMatcher {
411 plan: classified.plan,
412 tier,
413 pattern: pattern.to_string(),
414 reason: classified.descriptor.reason,
415 })
416 }
417
418 fn apply_below_min(
419 &self,
420 pattern: &str,
421 wanted: MatcherTier,
422 classified: Classified,
423 ) -> Result<StrMatcher, BuildError> {
424 let got = classified.descriptor.tier;
425 let reason = classified.descriptor.reason;
426 let hint = classified.descriptor.hint;
427
428 match self.on_below_min {
429 OnBelowMin::Reject => Err(BuildError::TierTooLow {
430 pattern: pattern.to_string(),
431 wanted,
432 got,
433 reason,
434 hint,
435 }),
436 OnBelowMin::Warn => {
437 warn::on_regex_fallback(pattern, reason, hint, /*force=*/ true);
438 metrics_inc_fallback();
439 Ok(StrMatcher {
440 plan: classified.plan,
441 tier: got,
442 pattern: pattern.to_string(),
443 reason,
444 })
445 }
446 OnBelowMin::Allow => {
447 if got == MatcherTier::Regex {
448 warn::on_regex_fallback(pattern, reason, hint, /*force=*/ false);
449 metrics_inc_fallback();
450 }
451 Ok(StrMatcher {
452 plan: classified.plan,
453 tier: got,
454 pattern: pattern.to_string(),
455 reason,
456 })
457 }
458 }
459 }
460}
461
462// ---------------------------------------------------------------------------
463// StrMatcherSet -- multi-pattern, single AC scan where possible
464// ---------------------------------------------------------------------------
465
466/// Multi-pattern matcher.
467///
468/// Build-time partition:
469///
470/// - Unanchored `Byte` / `Literal` / `LiteralSet` patterns fold into
471/// one shared aho-corasick. One linear scan covers them all --
472/// O(M + total_matches).
473/// - Anchored patterns (`^foo`, `bar$`, `^baz$`) and `Regex` fall-
474/// throughs stay as per-pattern matchers.
475///
476/// The merged AC runs `MatchKind::LeftmostLongest` -- overlapping
477/// literals like `AKIA` vs `AKIA1234` resolve to the longer match.
478/// Pattern indices survive the merge: `SetMatch.pattern_idx`
479/// always matches the caller's input order.
480pub struct StrMatcherSet {
481 merged: Option<MergedAc>,
482 /// Anchored / regex-tier patterns, paired with input index.
483 individual: Vec<(usize, StrMatcher)>,
484 /// Tier per input pattern, in input order. Used for `tier_counts`.
485 tiers: Vec<MatcherTier>,
486}
487
488/// Cross-pattern AC. `pattern_indices[ac_id]` maps an AC pattern ID
489/// back to the caller's input index -- one input pattern can
490/// contribute multiple literals (alternation, byte-set), so indices
491/// repeat.
492struct MergedAc {
493 ac: aho_corasick::AhoCorasick,
494 pattern_indices: Vec<usize>,
495}
496
497impl StrMatcherSet {
498 /// Compile every pattern. Patterns that violate the builder's
499 /// `min_tier` policy with `OnBelowMin::Reject` fail the build for
500 /// the entire set -- returning the first failing pattern's error.
501 ///
502 /// # Errors
503 ///
504 /// As [`StrMatcherBuilder::build`].
505 pub fn new<I, S>(patterns: I) -> Result<Self, BuildError>
506 where
507 I: IntoIterator<Item = S>,
508 S: AsRef<str>,
509 {
510 Self::builder().build_set(patterns)
511 }
512
513 /// Start a builder.
514 #[must_use]
515 pub fn builder() -> StrMatcherBuilder {
516 StrMatcherBuilder::new()
517 }
518
519 /// `true` if any pattern matches anywhere in the haystack.
520 #[must_use]
521 pub fn is_match(&self, hay: &[u8]) -> bool {
522 if let Some(m) = &self.merged
523 && m.ac.find(hay).is_some()
524 {
525 return true;
526 }
527 self.individual.iter().any(|(_, m)| m.is_match(hay))
528 }
529
530 /// Find the earliest match across all patterns.
531 ///
532 /// Order:
533 /// - Lower `start` wins.
534 /// - At the same `start`, the merged AC applies
535 /// `MatchKind::LeftmostLongest` *within* its literal set, so
536 /// the longer literal wins among merged patterns (e.g.
537 /// `AKIA1234` over `AKIA`).
538 /// - Across the merge boundary (merged-AC hit vs an individual
539 /// anchored / regex matcher at the same `start`), the
540 /// lower `pattern_idx` wins -- input-order preserved.
541 #[must_use]
542 pub fn earliest_match(&self, hay: &[u8]) -> Option<SetMatch> {
543 let mut best: Option<SetMatch> = None;
544 if let Some(m) = &self.merged
545 && let Some(hit) = m.ac.find(hay)
546 {
547 best = Some(SetMatch {
548 start: hit.start(),
549 end: hit.end(),
550 pattern_idx: m.pattern_indices[hit.pattern().as_usize()],
551 });
552 }
553 for (idx, matcher) in &self.individual {
554 if let Some(found) = matcher.find(hay) {
555 let cand = SetMatch {
556 start: found.start,
557 end: found.end,
558 pattern_idx: *idx,
559 };
560 best = match best {
561 None => Some(cand),
562 Some(b) if cand.start < b.start => Some(cand),
563 Some(b) if cand.start == b.start && cand.pattern_idx < b.pattern_idx => {
564 Some(cand)
565 }
566 Some(b) => Some(b),
567 };
568 }
569 }
570 best
571 }
572
573 /// Collect every non-overlapping match across all patterns,
574 /// sorted by `(start, pattern_idx)`.
575 ///
576 /// The merged AC runs `MatchKind::LeftmostLongest` internally,
577 /// so when two literals overlap within the merged set, the
578 /// longer one wins. Across the merge boundary the sort key
579 /// gives input-order tie-breaking.
580 ///
581 /// When multiple patterns match at the same position, the
582 /// lower-indexed pattern wins (consistent with the input order).
583 /// Matches across different patterns may overlap.
584 #[must_use]
585 pub fn find_iter(&self, hay: &[u8]) -> std::vec::IntoIter<SetMatch> {
586 let mut all: Vec<SetMatch> = Vec::new();
587 if let Some(m) = &self.merged {
588 for hit in m.ac.find_iter(hay) {
589 all.push(SetMatch {
590 start: hit.start(),
591 end: hit.end(),
592 pattern_idx: m.pattern_indices[hit.pattern().as_usize()],
593 });
594 }
595 }
596 for (idx, matcher) in &self.individual {
597 for hit in matcher.find_iter(hay) {
598 all.push(SetMatch {
599 start: hit.start,
600 end: hit.end,
601 pattern_idx: *idx,
602 });
603 }
604 }
605 all.sort_by_key(|m| (m.start, m.pattern_idx));
606 all.into_iter()
607 }
608
609 /// Per-tier population counts: `[Byte, Literal, LiteralSet, Regex]`.
610 /// Useful for operator dashboards / quality gates.
611 #[must_use]
612 pub fn tier_counts(&self) -> [usize; 4] {
613 let mut counts = [0_usize; 4];
614 for t in &self.tiers {
615 let idx = match t {
616 MatcherTier::Byte => 0,
617 MatcherTier::Literal => 1,
618 MatcherTier::LiteralSet => 2,
619 MatcherTier::Regex => 3,
620 };
621 counts[idx] += 1;
622 }
623 counts
624 }
625
626 /// Number of patterns in the set.
627 #[must_use]
628 pub fn len(&self) -> usize {
629 self.tiers.len()
630 }
631
632 /// `true` if the set has no patterns.
633 #[must_use]
634 pub fn is_empty(&self) -> bool {
635 self.tiers.is_empty()
636 }
637}
638
639impl StrMatcherBuilder {
640 /// Build a [`StrMatcherSet`] from an iterator of patterns.
641 ///
642 /// `min_tier` + `on_below_min` apply per pattern. First failure
643 /// short-circuits the build. Unanchored literals fold into a
644 /// merged AC; anchored / regex patterns stay individual.
645 ///
646 /// # Errors
647 ///
648 /// As [`StrMatcherBuilder::build`].
649 pub fn build_set<I, S>(&self, patterns: I) -> Result<StrMatcherSet, BuildError>
650 where
651 I: IntoIterator<Item = S>,
652 S: AsRef<str>,
653 {
654 let mut individual: Vec<(usize, StrMatcher)> = Vec::new();
655 let mut merge_lits: Vec<Vec<u8>> = Vec::new();
656 let mut pattern_indices: Vec<usize> = Vec::new();
657 let mut tiers: Vec<MatcherTier> = Vec::new();
658
659 for (idx, pat) in patterns.into_iter().enumerate() {
660 let m = self.build(pat.as_ref())?;
661 tiers.push(m.tier);
662 match m.plan.merge_literals() {
663 Some((lits, ci)) if ci == self.ascii_case_insensitive => {
664 for lit in lits {
665 pattern_indices.push(idx);
666 merge_lits.push(lit);
667 }
668 }
669 _ => individual.push((idx, m)),
670 }
671 }
672
673 // AC build is infallible on a pre-validated literal set --
674 // empty merge_lits short-circuits to None above, and the
675 // classifier already enforces LITERAL_SET_CAP. A failure
676 // here would mean a contract violation, so panic.
677 let merged = (!merge_lits.is_empty()).then(|| {
678 let ac = aho_corasick::AhoCorasickBuilder::new()
679 .ascii_case_insensitive(self.ascii_case_insensitive)
680 .match_kind(aho_corasick::MatchKind::LeftmostLongest)
681 .build(&merge_lits)
682 .expect("strmatch: merged AC build failed on a pre-validated literal set");
683 MergedAc {
684 ac,
685 pattern_indices,
686 }
687 });
688
689 Ok(StrMatcherSet {
690 merged,
691 individual,
692 tiers,
693 })
694 }
695}
696
697// The inherent single-pattern build method already exists above; the
698// trait-style impl below disambiguates against the iterator version.
699// Rust resolves single-pattern `&str` to the inherent method because
700// `IntoIterator<Item = S: AsRef<str>>` is not implemented for `&str`
701// (a `&str` doesn't iter into `&str`s).
702
703// ---------------------------------------------------------------------------
704// Anti-spam log helper
705// ---------------------------------------------------------------------------
706
707mod warn {
708 use super::{AtomicBool, AtomicUsize, HashSet, LazyLock, Mutex, Ordering};
709
710 /// Soft cap on distinct WARN-level fall-back log lines per process.
711 /// Past this we step down to DEBUG and emit one INFO summary.
712 pub(super) const WARN_CAP: usize = 10;
713
714 static WARNED_HASHES: LazyLock<Mutex<HashSet<u64>>> =
715 LazyLock::new(|| Mutex::new(HashSet::new()));
716 static DISTINCT: AtomicUsize = AtomicUsize::new(0);
717 static SUMMARY_EMITTED: AtomicBool = AtomicBool::new(false);
718
719 fn hash_pattern(pattern: &str) -> u64 {
720 use std::collections::hash_map::DefaultHasher;
721 use std::hash::{Hash, Hasher};
722 let mut h = DefaultHasher::new();
723 pattern.hash(&mut h);
724 h.finish()
725 }
726
727 /// Run the dedup + cap protocol. `force = true` always emits a
728 /// WARN (used by `OnBelowMin::Warn` callers who explicitly want
729 /// the log line).
730 pub(super) fn on_regex_fallback(
731 pattern: &str,
732 reason: &'static str,
733 hint: &'static str,
734 force: bool,
735 ) {
736 let h = hash_pattern(pattern);
737 let already_warned = {
738 let mut set = WARNED_HASHES.lock().unwrap_or_else(|e| e.into_inner());
739 !set.insert(h)
740 };
741
742 if already_warned && !force {
743 // Already warned about this pattern; respect the dedup.
744 return;
745 }
746
747 let n = if force {
748 // Force path doesn't count toward the cap.
749 DISTINCT.load(Ordering::Relaxed)
750 } else {
751 DISTINCT.fetch_add(1, Ordering::Relaxed) + 1
752 };
753
754 if force || n <= WARN_CAP {
755 tracing::warn!(
756 target: "hyperi_rustlib::strmatch",
757 pattern,
758 reason,
759 hint,
760 "pattern falls through to regex engine on hot path"
761 );
762 } else {
763 tracing::debug!(
764 target: "hyperi_rustlib::strmatch",
765 pattern,
766 reason,
767 hint,
768 "regex fallback (WARN suppressed past cap)"
769 );
770 }
771
772 if !force && n == WARN_CAP + 1 && !SUMMARY_EMITTED.swap(true, Ordering::Relaxed) {
773 tracing::info!(
774 target: "hyperi_rustlib::strmatch",
775 cap = WARN_CAP,
776 "{}+ distinct patterns have fallen through to the regex engine; \
777 further fall-throughs log at DEBUG. Inspect StrMatcher::tier() / \
778 StrMatcherSet::tier_counts() at runtime, or scrape the \
779 hyperi_strmatch_regex_fallback_total metric.",
780 WARN_CAP,
781 );
782 }
783 }
784
785 /// Reset state for tests. Symbol exists only under `cfg(test)` so
786 /// production callers can't accidentally use it.
787 #[cfg(test)]
788 pub(super) fn reset_for_tests() {
789 WARNED_HASHES
790 .lock()
791 .unwrap_or_else(|e| e.into_inner())
792 .clear();
793 DISTINCT.store(0, Ordering::Relaxed);
794 SUMMARY_EMITTED.store(false, Ordering::Relaxed);
795 }
796}
797
798#[inline]
799fn metrics_inc_fallback() {
800 #[cfg(feature = "metrics")]
801 metrics::counter!("hyperi_strmatch_regex_fallback_total").increment(1);
802}
803
804// Re-export the warn-state reset for integration tests inside the
805// crate. Hidden from doc and not part of the public API.
806#[cfg(test)]
807#[doc(hidden)]
808pub fn reset_warn_state_for_tests() {
809 warn::reset_for_tests();
810}
811
812// ---------------------------------------------------------------------------
813// Tests
814// ---------------------------------------------------------------------------
815
816#[cfg(test)]
817mod tests;