hyperi_rustlib/strmatch/mod.rs
1// Project: hyperi-rustlib
2// File: src/strmatch/mod.rs
3// Purpose: Public API for the strmatch regex→fast-path matcher
4// Language: Rust
5//
6// License: BUSL-1.1
7// Copyright: (c) 2026 HYPERI PTY LIMITED
8
9//! Regex-shaped patterns, fast-path dispatch.
10//!
11//! Operators write a regex (the most familiar pattern language).
12//! `strmatch` classifies it into one of four tiers and dispatches at
13//! match time via the cheapest engine that's correct:
14//!
15//! - **Byte** (≤ 30 ns) -- direct byte ops: `memchr` / `memchr2` /
16//! `memchr3` / single-byte `starts_with` / `ends_with` / `==`.
17//! - **Literal** (≤ 200 ns) -- single multi-byte literal:
18//! `memmem::Finder` / multi-byte `starts_with` / `ends_with` / `==`.
19//! - **LiteralSet** (≤ 500 ns) -- `aho-corasick` over ≥ 2 literals
20//! (optional uniform anchor checked after the AC scan). Regex engine
21//! is never invoked.
22//! - **Regex** (engine-bounded) -- fall through to
23//! `regex-automata::meta::Regex`. Has its own internal prefilter
24//! pipeline; cost depends on pattern and haystack.
25//!
26//! Budgets are typical for a modern x86 server on a ~200-byte
27//! haystack; see [`MatcherTier::typical_budget_ns`] and
28//! `benches/strmatch.rs`.
29//!
30//! ## Anti-spam discipline
31//!
32//! When a pattern compiles to [`MatcherTier::Regex`] (the engine
33//! fall-back), `strmatch` emits **one** WARN per distinct pattern per
34//! process, capped at 10 distinct WARNs total. After the cap, further
35//! fall-through patterns log at DEBUG. A counter
36//! `hyperi_strmatch_regex_fallback_total` increments on every
37//! fall-through regardless of log level -- operators can scrape that
38//! rather than rely on logs.
39//!
40//! ## Quality gates
41//!
42//! Use [`StrMatcher::builder`] with [`StrMatcherBuilder::min_tier`] to
43//! reject (or loudly warn about) patterns that fall below an
44//! operator-chosen tier. Useful for hot-path configs where regex
45//! fall-through is unacceptable.
46//!
47//! ## Example
48//!
49//! ```
50//! use hyperi_rustlib::strmatch::{MatcherTier, OnBelowMin, StrMatcher};
51//!
52//! // Byte tier -- anchored single byte, dispatches to hay.first() == Some(b)
53//! let m = StrMatcher::new(r"^/")?;
54//! assert_eq!(m.tier(), MatcherTier::Byte);
55//! assert!(m.is_match(b"/api/v1/orders"));
56//!
57//! // Literal tier -- multi-byte literal, dispatches to memmem
58//! let m = StrMatcher::new(r"AKIA")?;
59//! assert_eq!(m.tier(), MatcherTier::Literal);
60//! assert!(m.is_match(b"... AKIA1234 ..."));
61//!
62//! // LiteralSet tier -- alternation, dispatches to AhoCorasick
63//! let m = StrMatcher::new(r"AKIA|ghp_|sk_live_")?;
64//! assert_eq!(m.tier(), MatcherTier::LiteralSet);
65//! assert!(m.is_match(b"github token: ghp_abcdef"));
66//!
67//! // Regex tier -- falls through to engine; refuse the build instead
68//! let err = StrMatcher::builder()
69//! .min_tier(MatcherTier::LiteralSet)
70//! .on_below_min(OnBelowMin::Reject)
71//! .build(r"\w+@\w+")
72//! .unwrap_err();
73//! assert!(err.to_string().contains("tier"));
74//! # Ok::<(), hyperi_rustlib::strmatch::BuildError>(())
75//! ```
76
77mod classify;
78mod plan;
79
80use std::collections::HashSet;
81use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
82use std::sync::{LazyLock, Mutex};
83
84use thiserror::Error;
85
86use classify::Classified;
87use plan::Plan;
88
89// ---------------------------------------------------------------------------
90// Public types
91// ---------------------------------------------------------------------------
92
93/// Which engine class a [`StrMatcher`] is dispatching to. Tiers are
94/// ordered by cost -- `Byte > Literal > LiteralSet > Regex` (higher
95/// means faster). Use [`Self::rank`] for `min_tier` comparisons.
96///
97/// Budgets below are typical for a modern x86 server on a ~200-byte
98/// haystack. Hardware-dependent; cold caches and pathological inputs
99/// move them around.
100#[non_exhaustive]
101#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
102pub enum MatcherTier {
103 /// **≤ 30 ns** -- direct byte ops: `memchr` / `memchr2/3` / single-byte
104 /// `starts_with` / `ends_with` / `==`.
105 Byte,
106 /// **≤ 200 ns** -- single multi-byte literal: `memmem::Finder` /
107 /// multi-byte `starts_with` / `ends_with` / `==`.
108 Literal,
109 /// **≤ 500 ns** -- `aho-corasick` over ≥ 2 literals (optional
110 /// uniform anchor checked after the AC scan).
111 LiteralSet,
112 /// **Bounded by the regex engine** -- `regex-automata::meta::Regex`
113 /// fall-through. Cost depends on pattern and haystack.
114 Regex,
115}
116
117impl std::fmt::Display for MatcherTier {
118 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
119 match self {
120 Self::Byte => f.write_str("Byte"),
121 Self::Literal => f.write_str("Literal"),
122 Self::LiteralSet => f.write_str("LiteralSet"),
123 Self::Regex => f.write_str("Regex"),
124 }
125 }
126}
127
128impl MatcherTier {
129 /// Cost rank -- higher means faster. Useful for `min_tier`
130 /// comparisons without depending on the specific ns numbers.
131 ///
132 /// Ordering: `Byte (4) > Literal (3) > LiteralSet (2) > Regex (1)`.
133 #[must_use]
134 pub const fn rank(self) -> u8 {
135 match self {
136 Self::Byte => 4,
137 Self::Literal => 3,
138 Self::LiteralSet => 2,
139 Self::Regex => 1,
140 }
141 }
142
143 /// Indicative ns budget per `is_match` call on a ~200-byte
144 /// haystack (modern x86 server). `None` for [`Self::Regex`] since
145 /// the engine is unbounded.
146 ///
147 /// Numbers are estimates from benchmarks in `benches/strmatch.rs`;
148 /// real-world cost varies with haystack length, cache state, and
149 /// pattern complexity.
150 #[must_use]
151 pub const fn typical_budget_ns(self) -> Option<u64> {
152 match self {
153 Self::Byte => Some(30),
154 Self::Literal => Some(200),
155 Self::LiteralSet => Some(500),
156 Self::Regex => None,
157 }
158 }
159}
160
161/// Byte offsets of a match. End is exclusive: `&hay[start..end]` is the
162/// matched slice.
163#[derive(Debug, Clone, Copy, PartialEq, Eq)]
164#[non_exhaustive]
165pub struct Match {
166 pub start: usize,
167 pub end: usize,
168}
169
170/// Like [`Match`] but also identifies which input pattern matched in a
171/// [`StrMatcherSet`].
172#[derive(Debug, Clone, Copy, PartialEq, Eq)]
173#[non_exhaustive]
174pub struct SetMatch {
175 pub start: usize,
176 pub end: usize,
177 pub pattern_idx: usize,
178}
179
180/// Failure modes during construction.
181#[derive(Debug, Error)]
182#[non_exhaustive]
183pub enum BuildError {
184 /// The pattern is empty.
185 #[error(
186 "strmatch: empty pattern\n \
187 hint: an empty pattern matches every position; pass a non-empty \
188 literal or wrap the matcher in Option for an absent-matcher slot"
189 )]
190 Empty,
191
192 /// The regex parser rejected the pattern.
193 #[error(
194 "strmatch: regex syntax error in pattern {pattern:?}\n \
195 reason: {source}\n \
196 hint: {hint}"
197 )]
198 Syntax {
199 pattern: String,
200 #[source]
201 source: Box<regex_syntax::Error>,
202 hint: &'static str,
203 },
204
205 /// The pattern compiled to a tier below the builder's minimum.
206 #[error(
207 "strmatch: pattern {pattern:?} compiles to tier {got}, but builder \
208 requires at least {wanted}\n \
209 reason: {reason}\n \
210 hint: {hint}"
211 )]
212 TierTooLow {
213 pattern: String,
214 wanted: MatcherTier,
215 got: MatcherTier,
216 reason: &'static str,
217 hint: &'static str,
218 },
219}
220
221/// What to do when a pattern's classification falls below the
222/// builder's `min_tier`.
223#[non_exhaustive]
224#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
225pub enum OnBelowMin {
226 /// Default -- proceed quietly. The anti-spam protocol still emits
227 /// up to ten WARNs per process for fall-through patterns.
228 #[default]
229 Allow,
230 /// Proceed, but force an immediate WARN that bypasses the anti-spam
231 /// cap. Use when the caller explicitly wants every fall-through
232 /// logged regardless of process-wide dedup state.
233 Warn,
234 /// Refuse to build. Returns [`BuildError::TierTooLow`].
235 Reject,
236}
237
238// ---------------------------------------------------------------------------
239// StrMatcher
240// ---------------------------------------------------------------------------
241
242/// Compiled pattern with tier-aware dispatch.
243pub struct StrMatcher {
244 plan: Plan,
245 tier: MatcherTier,
246 pattern: String,
247 /// Short machine-readable reason (e.g. `"shape:starts-with"`,
248 /// `"literal-only:alternation"`, `"unbounded-quantifier"`).
249 reason: &'static str,
250}
251
252impl std::fmt::Debug for StrMatcher {
253 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
254 f.debug_struct("StrMatcher")
255 .field("tier", &self.tier)
256 .field("pattern", &self.pattern)
257 .field("reason", &self.reason)
258 .finish_non_exhaustive()
259 }
260}
261
262impl StrMatcher {
263 /// Compile `pattern` with default options (no minimum tier, no
264 /// case-insensitivity).
265 ///
266 /// # Errors
267 ///
268 /// Returns `Err` if the pattern is empty or fails to parse. See
269 /// [`BuildError`] for the structured variants.
270 pub fn new(pattern: &str) -> Result<Self, BuildError> {
271 Self::builder().build(pattern)
272 }
273
274 /// Start a builder.
275 #[must_use]
276 pub fn builder() -> StrMatcherBuilder {
277 StrMatcherBuilder::new()
278 }
279
280 /// `true` if the haystack contains a match.
281 #[inline]
282 #[must_use]
283 pub fn is_match(&self, hay: &[u8]) -> bool {
284 self.plan.is_match(hay)
285 }
286
287 /// Find the first match, returning byte offsets.
288 #[inline]
289 #[must_use]
290 pub fn find(&self, hay: &[u8]) -> Option<Match> {
291 self.plan.find(hay)
292 }
293
294 /// Collect every non-overlapping match in iteration order.
295 ///
296 /// Returns an owned iterator over `Match`. The implementation
297 /// eagerly populates a `Vec` and returns its `IntoIter`; this
298 /// costs one allocation per call but amortises well over typical
299 /// scrubber workloads where each haystack yields 0-3 matches.
300 ///
301 /// For the anchored shapes (`StartsWith`, `EndsWith`,
302 /// `ExactMatch`) the iterator yields at most one match.
303 #[must_use]
304 pub fn find_iter(&self, hay: &[u8]) -> std::vec::IntoIter<Match> {
305 let mut out = Vec::new();
306 self.plan.collect_matches(hay, &mut out);
307 out.into_iter()
308 }
309
310 /// The tier this matcher dispatches to. Useful for telemetry and
311 /// assertions in tests.
312 #[must_use]
313 pub fn tier(&self) -> MatcherTier {
314 self.tier
315 }
316
317 /// The original pattern string passed to [`Self::new`] (or the
318 /// builder).
319 #[must_use]
320 pub fn pattern(&self) -> &str {
321 &self.pattern
322 }
323
324 /// Short machine-readable reason for the tier choice. For Shape /
325 /// Literal this is a "success" reason (e.g. `"shape:starts-with"`).
326 /// For Meta this names the disqualifying feature
327 /// (`"unbounded-quantifier"`, `"word-boundary"`, …).
328 #[must_use]
329 pub fn reason(&self) -> &'static str {
330 self.reason
331 }
332}
333
334// ---------------------------------------------------------------------------
335// StrMatcherBuilder
336// ---------------------------------------------------------------------------
337
338/// Builder for [`StrMatcher`]. Carries minimum-tier policy and the
339/// case-insensitivity flag.
340#[derive(Debug, Clone, Default)]
341pub struct StrMatcherBuilder {
342 min_tier: Option<MatcherTier>,
343 on_below_min: OnBelowMin,
344 ascii_case_insensitive: bool,
345}
346
347impl StrMatcherBuilder {
348 /// Create a builder with default options (no minimum tier, no
349 /// case-insensitivity).
350 #[must_use]
351 pub fn new() -> Self {
352 Self::default()
353 }
354
355 /// Reject (or warn about) patterns that classify below this tier.
356 ///
357 /// Tier ordering (highest → lowest): `Shape > Literal > Meta`.
358 /// Setting `min_tier(Literal)` allows Shape and Literal but
359 /// triggers `on_below_min` for Meta.
360 #[must_use]
361 pub fn min_tier(mut self, tier: MatcherTier) -> Self {
362 self.min_tier = Some(tier);
363 self
364 }
365
366 /// What to do when a pattern falls below `min_tier`.
367 #[must_use]
368 pub fn on_below_min(mut self, policy: OnBelowMin) -> Self {
369 self.on_below_min = policy;
370 self
371 }
372
373 /// Build a case-insensitive matcher.
374 ///
375 /// The pattern is wrapped in `(?i:...)` before parsing. Both the
376 /// literal and shape tiers honour the flag via
377 /// `aho_corasick::AhoCorasickBuilder::ascii_case_insensitive`;
378 /// shape-tier dispatch downgrades to the literal tier when this
379 /// flag is set, since `memchr` does not fold case.
380 #[must_use]
381 pub fn ascii_case_insensitive(mut self, enabled: bool) -> Self {
382 self.ascii_case_insensitive = enabled;
383 self
384 }
385
386 /// Build the matcher.
387 ///
388 /// # Errors
389 ///
390 /// Returns `Err` if the pattern fails to parse or violates the
391 /// `min_tier` policy with `on_below_min = Reject`.
392 pub fn build(&self, pattern: &str) -> Result<StrMatcher, BuildError> {
393 let classified = classify::classify(pattern, self.ascii_case_insensitive)?;
394 let tier = classified.descriptor.tier;
395
396 if let Some(min) = self.min_tier
397 && tier.rank() < min.rank()
398 {
399 return self.apply_below_min(pattern, min, classified);
400 }
401
402 if tier == MatcherTier::Regex {
403 warn::on_regex_fallback(
404 pattern,
405 classified.descriptor.reason,
406 classified.descriptor.hint,
407 /*force=*/ false,
408 );
409 metrics_inc_fallback();
410 }
411
412 Ok(StrMatcher {
413 plan: classified.plan,
414 tier,
415 pattern: pattern.to_string(),
416 reason: classified.descriptor.reason,
417 })
418 }
419
420 fn apply_below_min(
421 &self,
422 pattern: &str,
423 wanted: MatcherTier,
424 classified: Classified,
425 ) -> Result<StrMatcher, BuildError> {
426 let got = classified.descriptor.tier;
427 let reason = classified.descriptor.reason;
428 let hint = classified.descriptor.hint;
429
430 match self.on_below_min {
431 OnBelowMin::Reject => Err(BuildError::TierTooLow {
432 pattern: pattern.to_string(),
433 wanted,
434 got,
435 reason,
436 hint,
437 }),
438 OnBelowMin::Warn => {
439 warn::on_regex_fallback(pattern, reason, hint, /*force=*/ true);
440 metrics_inc_fallback();
441 Ok(StrMatcher {
442 plan: classified.plan,
443 tier: got,
444 pattern: pattern.to_string(),
445 reason,
446 })
447 }
448 OnBelowMin::Allow => {
449 if got == MatcherTier::Regex {
450 warn::on_regex_fallback(pattern, reason, hint, /*force=*/ false);
451 metrics_inc_fallback();
452 }
453 Ok(StrMatcher {
454 plan: classified.plan,
455 tier: got,
456 pattern: pattern.to_string(),
457 reason,
458 })
459 }
460 }
461 }
462}
463
464// ---------------------------------------------------------------------------
465// StrMatcherSet -- multi-pattern, single AC scan where possible
466// ---------------------------------------------------------------------------
467
468/// Multi-pattern matcher.
469///
470/// Build-time partition:
471///
472/// - Unanchored `Byte` / `Literal` / `LiteralSet` patterns fold into
473/// one shared aho-corasick. One linear scan covers them all --
474/// O(M + total_matches).
475/// - Anchored patterns (`^foo`, `bar$`, `^baz$`) and `Regex` fall-
476/// throughs stay as per-pattern matchers.
477///
478/// The merged AC runs `MatchKind::LeftmostLongest` -- overlapping
479/// literals like `AKIA` vs `AKIA1234` resolve to the longer match.
480/// Pattern indices survive the merge: `SetMatch.pattern_idx`
481/// always matches the caller's input order.
482pub struct StrMatcherSet {
483 merged: Option<MergedAc>,
484 /// Anchored / regex-tier patterns, paired with input index.
485 individual: Vec<(usize, StrMatcher)>,
486 /// Tier per input pattern, in input order. Used for `tier_counts`.
487 tiers: Vec<MatcherTier>,
488}
489
490/// Cross-pattern AC. `pattern_indices[ac_id]` maps an AC pattern ID
491/// back to the caller's input index -- one input pattern can
492/// contribute multiple literals (alternation, byte-set), so indices
493/// repeat.
494struct MergedAc {
495 ac: aho_corasick::AhoCorasick,
496 pattern_indices: Vec<usize>,
497}
498
499impl StrMatcherSet {
500 /// Compile every pattern. Patterns that violate the builder's
501 /// `min_tier` policy with `OnBelowMin::Reject` fail the build for
502 /// the entire set -- returning the first failing pattern's error.
503 ///
504 /// # Errors
505 ///
506 /// As [`StrMatcherBuilder::build`].
507 pub fn new<I, S>(patterns: I) -> Result<Self, BuildError>
508 where
509 I: IntoIterator<Item = S>,
510 S: AsRef<str>,
511 {
512 Self::builder().build_set(patterns)
513 }
514
515 /// Start a builder.
516 #[must_use]
517 pub fn builder() -> StrMatcherBuilder {
518 StrMatcherBuilder::new()
519 }
520
521 /// `true` if any pattern matches anywhere in the haystack.
522 #[must_use]
523 pub fn is_match(&self, hay: &[u8]) -> bool {
524 if let Some(m) = &self.merged
525 && m.ac.find(hay).is_some()
526 {
527 return true;
528 }
529 self.individual.iter().any(|(_, m)| m.is_match(hay))
530 }
531
532 /// Find the earliest match across all patterns.
533 ///
534 /// Order:
535 /// - Lower `start` wins.
536 /// - At the same `start`, the merged AC applies
537 /// `MatchKind::LeftmostLongest` *within* its literal set, so
538 /// the longer literal wins among merged patterns (e.g.
539 /// `AKIA1234` over `AKIA`).
540 /// - Across the merge boundary (merged-AC hit vs an individual
541 /// anchored / regex matcher at the same `start`), the
542 /// lower `pattern_idx` wins -- input-order preserved.
543 #[must_use]
544 pub fn earliest_match(&self, hay: &[u8]) -> Option<SetMatch> {
545 let mut best: Option<SetMatch> = None;
546 if let Some(m) = &self.merged
547 && let Some(hit) = m.ac.find(hay)
548 {
549 best = Some(SetMatch {
550 start: hit.start(),
551 end: hit.end(),
552 pattern_idx: m.pattern_indices[hit.pattern().as_usize()],
553 });
554 }
555 for (idx, matcher) in &self.individual {
556 if let Some(found) = matcher.find(hay) {
557 let cand = SetMatch {
558 start: found.start,
559 end: found.end,
560 pattern_idx: *idx,
561 };
562 best = match best {
563 None => Some(cand),
564 Some(b) if cand.start < b.start => Some(cand),
565 Some(b) if cand.start == b.start && cand.pattern_idx < b.pattern_idx => {
566 Some(cand)
567 }
568 Some(b) => Some(b),
569 };
570 }
571 }
572 best
573 }
574
575 /// Collect every non-overlapping match across all patterns,
576 /// sorted by `(start, pattern_idx)`.
577 ///
578 /// The merged AC runs `MatchKind::LeftmostLongest` internally,
579 /// so when two literals overlap within the merged set, the
580 /// longer one wins. Across the merge boundary the sort key
581 /// gives input-order tie-breaking.
582 ///
583 /// When multiple patterns match at the same position, the
584 /// lower-indexed pattern wins (consistent with the input order).
585 /// Matches across different patterns may overlap.
586 #[must_use]
587 pub fn find_iter(&self, hay: &[u8]) -> std::vec::IntoIter<SetMatch> {
588 let mut all: Vec<SetMatch> = Vec::new();
589 if let Some(m) = &self.merged {
590 for hit in m.ac.find_iter(hay) {
591 all.push(SetMatch {
592 start: hit.start(),
593 end: hit.end(),
594 pattern_idx: m.pattern_indices[hit.pattern().as_usize()],
595 });
596 }
597 }
598 for (idx, matcher) in &self.individual {
599 for hit in matcher.find_iter(hay) {
600 all.push(SetMatch {
601 start: hit.start,
602 end: hit.end,
603 pattern_idx: *idx,
604 });
605 }
606 }
607 all.sort_by_key(|m| (m.start, m.pattern_idx));
608 all.into_iter()
609 }
610
611 /// Per-tier population counts: `[Byte, Literal, LiteralSet, Regex]`.
612 /// Useful for operator dashboards / quality gates.
613 #[must_use]
614 pub fn tier_counts(&self) -> [usize; 4] {
615 let mut counts = [0_usize; 4];
616 for t in &self.tiers {
617 let idx = match t {
618 MatcherTier::Byte => 0,
619 MatcherTier::Literal => 1,
620 MatcherTier::LiteralSet => 2,
621 MatcherTier::Regex => 3,
622 };
623 counts[idx] += 1;
624 }
625 counts
626 }
627
628 /// Number of patterns in the set.
629 #[must_use]
630 pub fn len(&self) -> usize {
631 self.tiers.len()
632 }
633
634 /// `true` if the set has no patterns.
635 #[must_use]
636 pub fn is_empty(&self) -> bool {
637 self.tiers.is_empty()
638 }
639}
640
641impl StrMatcherBuilder {
642 /// Build a [`StrMatcherSet`] from an iterator of patterns.
643 ///
644 /// `min_tier` + `on_below_min` apply per pattern. First failure
645 /// short-circuits the build. Unanchored literals fold into a
646 /// merged AC; anchored / regex patterns stay individual.
647 ///
648 /// # Errors
649 ///
650 /// As [`StrMatcherBuilder::build`].
651 pub fn build_set<I, S>(&self, patterns: I) -> Result<StrMatcherSet, BuildError>
652 where
653 I: IntoIterator<Item = S>,
654 S: AsRef<str>,
655 {
656 let mut individual: Vec<(usize, StrMatcher)> = Vec::new();
657 let mut merge_lits: Vec<Vec<u8>> = Vec::new();
658 let mut pattern_indices: Vec<usize> = Vec::new();
659 let mut tiers: Vec<MatcherTier> = Vec::new();
660
661 for (idx, pat) in patterns.into_iter().enumerate() {
662 let m = self.build(pat.as_ref())?;
663 tiers.push(m.tier);
664 match m.plan.merge_literals() {
665 Some((lits, ci)) if ci == self.ascii_case_insensitive => {
666 for lit in lits {
667 pattern_indices.push(idx);
668 merge_lits.push(lit);
669 }
670 }
671 _ => individual.push((idx, m)),
672 }
673 }
674
675 // AC build is infallible on a pre-validated literal set --
676 // empty merge_lits short-circuits to None above, and the
677 // classifier already enforces LITERAL_SET_CAP. A failure
678 // here would mean a contract violation, so panic.
679 let merged = (!merge_lits.is_empty()).then(|| {
680 let ac = aho_corasick::AhoCorasickBuilder::new()
681 .ascii_case_insensitive(self.ascii_case_insensitive)
682 .match_kind(aho_corasick::MatchKind::LeftmostLongest)
683 .build(&merge_lits)
684 .expect("strmatch: merged AC build failed on a pre-validated literal set");
685 MergedAc {
686 ac,
687 pattern_indices,
688 }
689 });
690
691 Ok(StrMatcherSet {
692 merged,
693 individual,
694 tiers,
695 })
696 }
697}
698
699// The inherent single-pattern build method already exists above; the
700// trait-style impl below disambiguates against the iterator version.
701// Rust resolves single-pattern `&str` to the inherent method because
702// `IntoIterator<Item = S: AsRef<str>>` is not implemented for `&str`
703// (a `&str` doesn't iter into `&str`s).
704
705// ---------------------------------------------------------------------------
706// Anti-spam log helper
707// ---------------------------------------------------------------------------
708
709mod warn {
710 use super::{AtomicBool, AtomicUsize, HashSet, LazyLock, Mutex, Ordering};
711
712 /// Soft cap on distinct WARN-level fall-back log lines per process.
713 /// Past this we step down to DEBUG and emit one INFO summary.
714 pub(super) const WARN_CAP: usize = 10;
715
716 static WARNED_HASHES: LazyLock<Mutex<HashSet<u64>>> =
717 LazyLock::new(|| Mutex::new(HashSet::new()));
718 static DISTINCT: AtomicUsize = AtomicUsize::new(0);
719 static SUMMARY_EMITTED: AtomicBool = AtomicBool::new(false);
720
721 fn hash_pattern(pattern: &str) -> u64 {
722 use std::collections::hash_map::DefaultHasher;
723 use std::hash::{Hash, Hasher};
724 let mut h = DefaultHasher::new();
725 pattern.hash(&mut h);
726 h.finish()
727 }
728
729 /// Run the dedup + cap protocol. `force = true` always emits a
730 /// WARN (used by `OnBelowMin::Warn` callers who explicitly want
731 /// the log line).
732 pub(super) fn on_regex_fallback(
733 pattern: &str,
734 reason: &'static str,
735 hint: &'static str,
736 force: bool,
737 ) {
738 let h = hash_pattern(pattern);
739 let already_warned = {
740 let mut set = WARNED_HASHES.lock().unwrap_or_else(|e| e.into_inner());
741 !set.insert(h)
742 };
743
744 if already_warned && !force {
745 // Already warned about this pattern; respect the dedup.
746 return;
747 }
748
749 let n = if force {
750 // Force path doesn't count toward the cap.
751 DISTINCT.load(Ordering::Relaxed)
752 } else {
753 DISTINCT.fetch_add(1, Ordering::Relaxed) + 1
754 };
755
756 if force || n <= WARN_CAP {
757 tracing::warn!(
758 target: "hyperi_rustlib::strmatch",
759 pattern,
760 reason,
761 hint,
762 "pattern falls through to regex engine on hot path"
763 );
764 } else {
765 tracing::debug!(
766 target: "hyperi_rustlib::strmatch",
767 pattern,
768 reason,
769 hint,
770 "regex fallback (WARN suppressed past cap)"
771 );
772 }
773
774 if !force && n == WARN_CAP + 1 && !SUMMARY_EMITTED.swap(true, Ordering::Relaxed) {
775 tracing::info!(
776 target: "hyperi_rustlib::strmatch",
777 cap = WARN_CAP,
778 "{}+ distinct patterns have fallen through to the regex engine; \
779 further fall-throughs log at DEBUG. Inspect StrMatcher::tier() / \
780 StrMatcherSet::tier_counts() at runtime, or scrape the \
781 hyperi_strmatch_regex_fallback_total metric.",
782 WARN_CAP,
783 );
784 }
785 }
786
787 /// Reset state for tests. Symbol exists only under `cfg(test)` so
788 /// production callers can't accidentally use it.
789 #[cfg(test)]
790 pub(super) fn reset_for_tests() {
791 WARNED_HASHES
792 .lock()
793 .unwrap_or_else(|e| e.into_inner())
794 .clear();
795 DISTINCT.store(0, Ordering::Relaxed);
796 SUMMARY_EMITTED.store(false, Ordering::Relaxed);
797 }
798}
799
800#[inline]
801fn metrics_inc_fallback() {
802 #[cfg(feature = "metrics")]
803 metrics::counter!("hyperi_strmatch_regex_fallback_total").increment(1);
804}
805
806// Re-export the warn-state reset for integration tests inside the
807// crate. Hidden from doc and not part of the public API.
808#[cfg(test)]
809#[doc(hidden)]
810pub fn reset_warn_state_for_tests() {
811 warn::reset_for_tests();
812}
813
814// ---------------------------------------------------------------------------
815// Tests
816// ---------------------------------------------------------------------------
817
818#[cfg(test)]
819mod tests;