Skip to main content

hyperi_rustlib/strmatch/
mod.rs

1// Project:   hyperi-rustlib
2// File:      src/strmatch/mod.rs
3// Purpose:   Public API for the strmatch regex→fast-path matcher
4// Language:  Rust
5//
6// License:   FSL-1.1-ALv2
7// Copyright: (c) 2026 HYPERI PTY LIMITED
8
9//! Regex-shaped patterns, fast-path dispatch.
10//!
11//! Operators write a regex (the most familiar pattern language).
12//! `strmatch` classifies it into one of four tiers and dispatches at
13//! match time via the cheapest engine that's correct:
14//!
15//! - **Byte** (≤ 30 ns) -- direct byte ops: `memchr` / `memchr2` /
16//!   `memchr3` / single-byte `starts_with` / `ends_with` / `==`.
17//! - **Literal** (≤ 200 ns) -- single multi-byte literal:
18//!   `memmem::Finder` / multi-byte `starts_with` / `ends_with` / `==`.
19//! - **LiteralSet** (≤ 500 ns) -- `aho-corasick` over ≥ 2 literals
20//!   (optional uniform anchor checked after the AC scan). Regex engine
21//!   is never invoked.
22//! - **Regex** (engine-bounded) -- fall through to
23//!   `regex-automata::meta::Regex`. Has its own internal prefilter
24//!   pipeline; cost depends on pattern and haystack.
25//!
26//! Budgets are typical for a modern x86 server on a ~200-byte
27//! haystack; see [`MatcherTier::typical_budget_ns`] and
28//! `benches/strmatch.rs`.
29//!
30//! ## Anti-spam discipline
31//!
32//! When a pattern compiles to [`MatcherTier::Regex`] (the engine
33//! fall-back), `strmatch` emits **one** WARN per distinct pattern per
34//! process, capped at 10 distinct WARNs total. After the cap, further
35//! fall-through patterns log at DEBUG. A counter
36//! `hyperi_strmatch_regex_fallback_total` increments on every
37//! fall-through regardless of log level -- operators can scrape that
38//! rather than rely on logs.
39//!
40//! ## Quality gates
41//!
42//! Use [`StrMatcher::builder`] with [`StrMatcherBuilder::min_tier`] to
43//! reject (or loudly warn about) patterns that fall below an
44//! operator-chosen tier. Useful for hot-path configs where regex
45//! fall-through is unacceptable.
46//!
47//! ## Example
48//!
49//! ```
50//! use hyperi_rustlib::strmatch::{MatcherTier, OnBelowMin, StrMatcher};
51//!
52//! // Byte tier -- anchored single byte, dispatches to hay.first() == Some(b)
53//! let m = StrMatcher::new(r"^/")?;
54//! assert_eq!(m.tier(), MatcherTier::Byte);
55//! assert!(m.is_match(b"/api/v1/orders"));
56//!
57//! // Literal tier -- multi-byte literal, dispatches to memmem
58//! let m = StrMatcher::new(r"AKIA")?;
59//! assert_eq!(m.tier(), MatcherTier::Literal);
60//! assert!(m.is_match(b"... AKIA1234 ..."));
61//!
62//! // LiteralSet tier -- alternation, dispatches to AhoCorasick
63//! let m = StrMatcher::new(r"AKIA|ghp_|sk_live_")?;
64//! assert_eq!(m.tier(), MatcherTier::LiteralSet);
65//! assert!(m.is_match(b"github token: ghp_abcdef"));
66//!
67//! // Regex tier -- falls through to engine; refuse the build instead
68//! let err = StrMatcher::builder()
69//!     .min_tier(MatcherTier::LiteralSet)
70//!     .on_below_min(OnBelowMin::Reject)
71//!     .build(r"\w+@\w+")
72//!     .unwrap_err();
73//! assert!(err.to_string().contains("tier"));
74//! # Ok::<(), hyperi_rustlib::strmatch::BuildError>(())
75//! ```
76
77mod classify;
78mod plan;
79
80use std::collections::HashSet;
81use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
82use std::sync::{LazyLock, Mutex};
83
84use thiserror::Error;
85
86use classify::Classified;
87use plan::Plan;
88
89// ---------------------------------------------------------------------------
90// Public types
91// ---------------------------------------------------------------------------
92
93/// Which engine class a [`StrMatcher`] is dispatching to. Tiers are
94/// ordered by cost -- `Byte > Literal > LiteralSet > Regex` (higher
95/// means faster). Use [`Self::rank`] for `min_tier` comparisons.
96///
97/// Budgets below are typical for a modern x86 server on a ~200-byte
98/// haystack. Hardware-dependent; cold caches and pathological inputs
99/// move them around.
100#[non_exhaustive]
101#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
102pub enum MatcherTier {
103    /// **≤ 30 ns** -- direct byte ops: `memchr` / `memchr2/3` / single-byte
104    /// `starts_with` / `ends_with` / `==`.
105    Byte,
106    /// **≤ 200 ns** -- single multi-byte literal: `memmem::Finder` /
107    /// multi-byte `starts_with` / `ends_with` / `==`.
108    Literal,
109    /// **≤ 500 ns** -- `aho-corasick` over ≥ 2 literals (optional
110    /// uniform anchor checked after the AC scan).
111    LiteralSet,
112    /// **Bounded by the regex engine** -- `regex-automata::meta::Regex`
113    /// fall-through. Cost depends on pattern and haystack.
114    Regex,
115}
116
117impl std::fmt::Display for MatcherTier {
118    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
119        match self {
120            Self::Byte => f.write_str("Byte"),
121            Self::Literal => f.write_str("Literal"),
122            Self::LiteralSet => f.write_str("LiteralSet"),
123            Self::Regex => f.write_str("Regex"),
124        }
125    }
126}
127
128impl MatcherTier {
129    /// Cost rank -- higher means faster. Useful for `min_tier`
130    /// comparisons without depending on the specific ns numbers.
131    ///
132    /// Ordering: `Byte (4) > Literal (3) > LiteralSet (2) > Regex (1)`.
133    #[must_use]
134    pub const fn rank(self) -> u8 {
135        match self {
136            Self::Byte => 4,
137            Self::Literal => 3,
138            Self::LiteralSet => 2,
139            Self::Regex => 1,
140        }
141    }
142
143    /// Indicative ns budget per `is_match` call on a ~200-byte
144    /// haystack (modern x86 server). `None` for [`Self::Regex`] since
145    /// the engine is unbounded.
146    ///
147    /// Numbers are estimates from benchmarks in `benches/strmatch.rs`;
148    /// real-world cost varies with haystack length, cache state, and
149    /// pattern complexity.
150    #[must_use]
151    pub const fn typical_budget_ns(self) -> Option<u64> {
152        match self {
153            Self::Byte => Some(30),
154            Self::Literal => Some(200),
155            Self::LiteralSet => Some(500),
156            Self::Regex => None,
157        }
158    }
159}
160
161/// Byte offsets of a match. End is exclusive: `&hay[start..end]` is the
162/// matched slice.
163#[derive(Debug, Clone, Copy, PartialEq, Eq)]
164#[non_exhaustive]
165pub struct Match {
166    pub start: usize,
167    pub end: usize,
168}
169
170/// Like [`Match`] but also identifies which input pattern matched in a
171/// [`StrMatcherSet`].
172#[derive(Debug, Clone, Copy, PartialEq, Eq)]
173#[non_exhaustive]
174pub struct SetMatch {
175    pub start: usize,
176    pub end: usize,
177    pub pattern_idx: usize,
178}
179
180/// Failure modes during construction.
181#[derive(Debug, Error)]
182#[non_exhaustive]
183pub enum BuildError {
184    /// The pattern is empty.
185    #[error(
186        "strmatch: empty pattern\n  \
187         hint: an empty pattern matches every position; pass a non-empty \
188         literal or wrap the matcher in Option for an absent-matcher slot"
189    )]
190    Empty,
191
192    /// The regex parser rejected the pattern.
193    #[error(
194        "strmatch: regex syntax error in pattern {pattern:?}\n  \
195         reason: {source}\n  \
196         hint: {hint}"
197    )]
198    Syntax {
199        pattern: String,
200        #[source]
201        source: Box<regex_syntax::Error>,
202        hint: &'static str,
203    },
204
205    /// The pattern compiled to a tier below the builder's minimum.
206    #[error(
207        "strmatch: pattern {pattern:?} compiles to tier {got}, but builder \
208         requires at least {wanted}\n  \
209         reason: {reason}\n  \
210         hint: {hint}"
211    )]
212    TierTooLow {
213        pattern: String,
214        wanted: MatcherTier,
215        got: MatcherTier,
216        reason: &'static str,
217        hint: &'static str,
218    },
219}
220
221/// What to do when a pattern's classification falls below the
222/// builder's `min_tier`.
223#[non_exhaustive]
224#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
225pub enum OnBelowMin {
226    /// Default -- proceed quietly. The anti-spam protocol still emits
227    /// up to ten WARNs per process for fall-through patterns.
228    #[default]
229    Allow,
230    /// Proceed, but force an immediate WARN that bypasses the anti-spam
231    /// cap. Use when the caller explicitly wants every fall-through
232    /// logged regardless of process-wide dedup state.
233    Warn,
234    /// Refuse to build. Returns [`BuildError::TierTooLow`].
235    Reject,
236}
237
238// ---------------------------------------------------------------------------
239// StrMatcher
240// ---------------------------------------------------------------------------
241
242/// Compiled pattern with tier-aware dispatch.
243pub struct StrMatcher {
244    plan: Plan,
245    tier: MatcherTier,
246    pattern: String,
247    /// Short machine-readable reason (e.g. `"shape:starts-with"`,
248    /// `"literal-only:alternation"`, `"unbounded-quantifier"`).
249    reason: &'static str,
250}
251
252impl std::fmt::Debug for StrMatcher {
253    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
254        f.debug_struct("StrMatcher")
255            .field("tier", &self.tier)
256            .field("pattern", &self.pattern)
257            .field("reason", &self.reason)
258            .finish_non_exhaustive()
259    }
260}
261
262impl StrMatcher {
263    /// Compile `pattern` with default options (no minimum tier, no
264    /// case-insensitivity).
265    ///
266    /// # Errors
267    ///
268    /// Returns `Err` if the pattern is empty or fails to parse. See
269    /// [`BuildError`] for the structured variants.
270    pub fn new(pattern: &str) -> Result<Self, BuildError> {
271        Self::builder().build(pattern)
272    }
273
274    /// Start a builder.
275    #[must_use]
276    pub fn builder() -> StrMatcherBuilder {
277        StrMatcherBuilder::new()
278    }
279
280    /// `true` if the haystack contains a match.
281    #[inline]
282    #[must_use]
283    pub fn is_match(&self, hay: &[u8]) -> bool {
284        self.plan.is_match(hay)
285    }
286
287    /// Find the first match, returning byte offsets.
288    #[inline]
289    #[must_use]
290    pub fn find(&self, hay: &[u8]) -> Option<Match> {
291        self.plan.find(hay)
292    }
293
294    /// Collect every non-overlapping match in iteration order.
295    ///
296    /// Returns an owned iterator over `Match`. The implementation
297    /// eagerly populates a `Vec` and returns its `IntoIter`; this
298    /// costs one allocation per call but amortises well over typical
299    /// scrubber workloads where each haystack yields 0-3 matches.
300    ///
301    /// For the anchored shapes (`StartsWith`, `EndsWith`,
302    /// `ExactMatch`) the iterator yields at most one match.
303    #[must_use]
304    pub fn find_iter(&self, hay: &[u8]) -> std::vec::IntoIter<Match> {
305        let mut out = Vec::new();
306        self.plan.collect_matches(hay, &mut out);
307        out.into_iter()
308    }
309
310    /// The tier this matcher dispatches to. Useful for telemetry and
311    /// assertions in tests.
312    #[must_use]
313    pub fn tier(&self) -> MatcherTier {
314        self.tier
315    }
316
317    /// The original pattern string passed to [`Self::new`] (or the
318    /// builder).
319    #[must_use]
320    pub fn pattern(&self) -> &str {
321        &self.pattern
322    }
323
324    /// Short machine-readable reason for the tier choice. For Shape /
325    /// Literal this is a "success" reason (e.g. `"shape:starts-with"`).
326    /// For Meta this names the disqualifying feature
327    /// (`"unbounded-quantifier"`, `"word-boundary"`, …).
328    #[must_use]
329    pub fn reason(&self) -> &'static str {
330        self.reason
331    }
332}
333
334// ---------------------------------------------------------------------------
335// StrMatcherBuilder
336// ---------------------------------------------------------------------------
337
338/// Builder for [`StrMatcher`]. Carries minimum-tier policy and the
339/// case-insensitivity flag.
340#[derive(Debug, Clone, Default)]
341pub struct StrMatcherBuilder {
342    min_tier: Option<MatcherTier>,
343    on_below_min: OnBelowMin,
344    ascii_case_insensitive: bool,
345}
346
347impl StrMatcherBuilder {
348    /// Create a builder with default options (no minimum tier, no
349    /// case-insensitivity).
350    #[must_use]
351    pub fn new() -> Self {
352        Self::default()
353    }
354
355    /// Reject (or warn about) patterns that classify below this tier.
356    ///
357    /// Tier ordering (highest → lowest): `Shape > Literal > Meta`.
358    /// Setting `min_tier(Literal)` allows Shape and Literal but
359    /// triggers `on_below_min` for Meta.
360    #[must_use]
361    pub fn min_tier(mut self, tier: MatcherTier) -> Self {
362        self.min_tier = Some(tier);
363        self
364    }
365
366    /// What to do when a pattern falls below `min_tier`.
367    #[must_use]
368    pub fn on_below_min(mut self, policy: OnBelowMin) -> Self {
369        self.on_below_min = policy;
370        self
371    }
372
373    /// Build a case-insensitive matcher.
374    ///
375    /// The pattern is wrapped in `(?i:...)` before parsing. Both the
376    /// literal and shape tiers honour the flag via
377    /// `aho_corasick::AhoCorasickBuilder::ascii_case_insensitive`;
378    /// shape-tier dispatch downgrades to the literal tier when this
379    /// flag is set, since `memchr` does not fold case.
380    #[must_use]
381    pub fn ascii_case_insensitive(mut self, enabled: bool) -> Self {
382        self.ascii_case_insensitive = enabled;
383        self
384    }
385
386    /// Build the matcher.
387    ///
388    /// # Errors
389    ///
390    /// Returns `Err` if the pattern fails to parse or violates the
391    /// `min_tier` policy with `on_below_min = Reject`.
392    pub fn build(&self, pattern: &str) -> Result<StrMatcher, BuildError> {
393        let classified = classify::classify(pattern, self.ascii_case_insensitive)?;
394        let tier = classified.descriptor.tier;
395
396        if let Some(min) = self.min_tier
397            && tier.rank() < min.rank()
398        {
399            return self.apply_below_min(pattern, min, classified);
400        }
401
402        if tier == MatcherTier::Regex {
403            warn::on_regex_fallback(
404                pattern,
405                classified.descriptor.reason,
406                classified.descriptor.hint,
407                /*force=*/ false,
408            );
409            metrics_inc_fallback();
410        }
411
412        Ok(StrMatcher {
413            plan: classified.plan,
414            tier,
415            pattern: pattern.to_string(),
416            reason: classified.descriptor.reason,
417        })
418    }
419
420    fn apply_below_min(
421        &self,
422        pattern: &str,
423        wanted: MatcherTier,
424        classified: Classified,
425    ) -> Result<StrMatcher, BuildError> {
426        let got = classified.descriptor.tier;
427        let reason = classified.descriptor.reason;
428        let hint = classified.descriptor.hint;
429
430        match self.on_below_min {
431            OnBelowMin::Reject => Err(BuildError::TierTooLow {
432                pattern: pattern.to_string(),
433                wanted,
434                got,
435                reason,
436                hint,
437            }),
438            OnBelowMin::Warn => {
439                warn::on_regex_fallback(pattern, reason, hint, /*force=*/ true);
440                metrics_inc_fallback();
441                Ok(StrMatcher {
442                    plan: classified.plan,
443                    tier: got,
444                    pattern: pattern.to_string(),
445                    reason,
446                })
447            }
448            OnBelowMin::Allow => {
449                if got == MatcherTier::Regex {
450                    warn::on_regex_fallback(pattern, reason, hint, /*force=*/ false);
451                    metrics_inc_fallback();
452                }
453                Ok(StrMatcher {
454                    plan: classified.plan,
455                    tier: got,
456                    pattern: pattern.to_string(),
457                    reason,
458                })
459            }
460        }
461    }
462}
463
464// ---------------------------------------------------------------------------
465// StrMatcherSet -- multi-pattern, single AC scan where possible
466// ---------------------------------------------------------------------------
467
468/// Multi-pattern matcher.
469///
470/// Build-time partition:
471///
472/// - Unanchored `Byte` / `Literal` / `LiteralSet` patterns fold into
473///   one shared aho-corasick. One linear scan covers them all --
474///   O(M + total_matches).
475/// - Anchored patterns (`^foo`, `bar$`, `^baz$`) and `Regex` fall-
476///   throughs stay as per-pattern matchers.
477///
478/// The merged AC runs `MatchKind::LeftmostLongest` -- overlapping
479/// literals like `AKIA` vs `AKIA1234` resolve to the longer match.
480/// Pattern indices survive the merge: `SetMatch.pattern_idx`
481/// always matches the caller's input order.
482pub struct StrMatcherSet {
483    merged: Option<MergedAc>,
484    /// Anchored / regex-tier patterns, paired with input index.
485    individual: Vec<(usize, StrMatcher)>,
486    /// Tier per input pattern, in input order. Used for `tier_counts`.
487    tiers: Vec<MatcherTier>,
488}
489
490/// Cross-pattern AC. `pattern_indices[ac_id]` maps an AC pattern ID
491/// back to the caller's input index -- one input pattern can
492/// contribute multiple literals (alternation, byte-set), so indices
493/// repeat.
494struct MergedAc {
495    ac: aho_corasick::AhoCorasick,
496    pattern_indices: Vec<usize>,
497}
498
499impl StrMatcherSet {
500    /// Compile every pattern. Patterns that violate the builder's
501    /// `min_tier` policy with `OnBelowMin::Reject` fail the build for
502    /// the entire set -- returning the first failing pattern's error.
503    ///
504    /// # Errors
505    ///
506    /// As [`StrMatcherBuilder::build`].
507    pub fn new<I, S>(patterns: I) -> Result<Self, BuildError>
508    where
509        I: IntoIterator<Item = S>,
510        S: AsRef<str>,
511    {
512        Self::builder().build_set(patterns)
513    }
514
515    /// Start a builder.
516    #[must_use]
517    pub fn builder() -> StrMatcherBuilder {
518        StrMatcherBuilder::new()
519    }
520
521    /// `true` if any pattern matches anywhere in the haystack.
522    #[must_use]
523    pub fn is_match(&self, hay: &[u8]) -> bool {
524        if let Some(m) = &self.merged
525            && m.ac.find(hay).is_some()
526        {
527            return true;
528        }
529        self.individual.iter().any(|(_, m)| m.is_match(hay))
530    }
531
532    /// Find the earliest match across all patterns.
533    ///
534    /// Order:
535    /// - Lower `start` wins.
536    /// - At the same `start`, the merged AC applies
537    ///   `MatchKind::LeftmostLongest` *within* its literal set, so
538    ///   the longer literal wins among merged patterns (e.g.
539    ///   `AKIA1234` over `AKIA`).
540    /// - Across the merge boundary (merged-AC hit vs an individual
541    ///   anchored / regex matcher at the same `start`), the
542    ///   lower `pattern_idx` wins -- input-order preserved.
543    #[must_use]
544    pub fn earliest_match(&self, hay: &[u8]) -> Option<SetMatch> {
545        let mut best: Option<SetMatch> = None;
546        if let Some(m) = &self.merged
547            && let Some(hit) = m.ac.find(hay)
548        {
549            best = Some(SetMatch {
550                start: hit.start(),
551                end: hit.end(),
552                pattern_idx: m.pattern_indices[hit.pattern().as_usize()],
553            });
554        }
555        for (idx, matcher) in &self.individual {
556            if let Some(found) = matcher.find(hay) {
557                let cand = SetMatch {
558                    start: found.start,
559                    end: found.end,
560                    pattern_idx: *idx,
561                };
562                best = match best {
563                    None => Some(cand),
564                    Some(b) if cand.start < b.start => Some(cand),
565                    Some(b) if cand.start == b.start && cand.pattern_idx < b.pattern_idx => {
566                        Some(cand)
567                    }
568                    Some(b) => Some(b),
569                };
570            }
571        }
572        best
573    }
574
575    /// Collect every non-overlapping match across all patterns,
576    /// sorted by `(start, pattern_idx)`.
577    ///
578    /// The merged AC runs `MatchKind::LeftmostLongest` internally,
579    /// so when two literals overlap within the merged set, the
580    /// longer one wins. Across the merge boundary the sort key
581    /// gives input-order tie-breaking.
582    ///
583    /// When multiple patterns match at the same position, the
584    /// lower-indexed pattern wins (consistent with the input order).
585    /// Matches across different patterns may overlap.
586    #[must_use]
587    pub fn find_iter(&self, hay: &[u8]) -> std::vec::IntoIter<SetMatch> {
588        let mut all: Vec<SetMatch> = Vec::new();
589        if let Some(m) = &self.merged {
590            for hit in m.ac.find_iter(hay) {
591                all.push(SetMatch {
592                    start: hit.start(),
593                    end: hit.end(),
594                    pattern_idx: m.pattern_indices[hit.pattern().as_usize()],
595                });
596            }
597        }
598        for (idx, matcher) in &self.individual {
599            for hit in matcher.find_iter(hay) {
600                all.push(SetMatch {
601                    start: hit.start,
602                    end: hit.end,
603                    pattern_idx: *idx,
604                });
605            }
606        }
607        all.sort_by_key(|m| (m.start, m.pattern_idx));
608        all.into_iter()
609    }
610
611    /// Per-tier population counts: `[Byte, Literal, LiteralSet, Regex]`.
612    /// Useful for operator dashboards / quality gates.
613    #[must_use]
614    pub fn tier_counts(&self) -> [usize; 4] {
615        let mut counts = [0_usize; 4];
616        for t in &self.tiers {
617            let idx = match t {
618                MatcherTier::Byte => 0,
619                MatcherTier::Literal => 1,
620                MatcherTier::LiteralSet => 2,
621                MatcherTier::Regex => 3,
622            };
623            counts[idx] += 1;
624        }
625        counts
626    }
627
628    /// Number of patterns in the set.
629    #[must_use]
630    pub fn len(&self) -> usize {
631        self.tiers.len()
632    }
633
634    /// `true` if the set has no patterns.
635    #[must_use]
636    pub fn is_empty(&self) -> bool {
637        self.tiers.is_empty()
638    }
639}
640
641impl StrMatcherBuilder {
642    /// Build a [`StrMatcherSet`] from an iterator of patterns.
643    ///
644    /// `min_tier` + `on_below_min` apply per pattern. First failure
645    /// short-circuits the build. Unanchored literals fold into a
646    /// merged AC; anchored / regex patterns stay individual.
647    ///
648    /// # Errors
649    ///
650    /// As [`StrMatcherBuilder::build`].
651    pub fn build_set<I, S>(&self, patterns: I) -> Result<StrMatcherSet, BuildError>
652    where
653        I: IntoIterator<Item = S>,
654        S: AsRef<str>,
655    {
656        let mut individual: Vec<(usize, StrMatcher)> = Vec::new();
657        let mut merge_lits: Vec<Vec<u8>> = Vec::new();
658        let mut pattern_indices: Vec<usize> = Vec::new();
659        let mut tiers: Vec<MatcherTier> = Vec::new();
660
661        for (idx, pat) in patterns.into_iter().enumerate() {
662            let m = self.build(pat.as_ref())?;
663            tiers.push(m.tier);
664            match m.plan.merge_literals() {
665                Some((lits, ci)) if ci == self.ascii_case_insensitive => {
666                    for lit in lits {
667                        pattern_indices.push(idx);
668                        merge_lits.push(lit);
669                    }
670                }
671                _ => individual.push((idx, m)),
672            }
673        }
674
675        // AC build is infallible on a pre-validated literal set --
676        // empty merge_lits short-circuits to None above, and the
677        // classifier already enforces LITERAL_SET_CAP. A failure
678        // here would mean a contract violation, so panic.
679        let merged = (!merge_lits.is_empty()).then(|| {
680            let ac = aho_corasick::AhoCorasickBuilder::new()
681                .ascii_case_insensitive(self.ascii_case_insensitive)
682                .match_kind(aho_corasick::MatchKind::LeftmostLongest)
683                .build(&merge_lits)
684                .expect("strmatch: merged AC build failed on a pre-validated literal set");
685            MergedAc {
686                ac,
687                pattern_indices,
688            }
689        });
690
691        Ok(StrMatcherSet {
692            merged,
693            individual,
694            tiers,
695        })
696    }
697}
698
699// The inherent single-pattern build method already exists above; the
700// trait-style impl below disambiguates against the iterator version.
701// Rust resolves single-pattern `&str` to the inherent method because
702// `IntoIterator<Item = S: AsRef<str>>` is not implemented for `&str`
703// (a `&str` doesn't iter into `&str`s).
704
705// ---------------------------------------------------------------------------
706// Anti-spam log helper
707// ---------------------------------------------------------------------------
708
709mod warn {
710    use super::{AtomicBool, AtomicUsize, HashSet, LazyLock, Mutex, Ordering};
711
712    /// Soft cap on distinct WARN-level fall-back log lines per process.
713    /// Past this we step down to DEBUG and emit one INFO summary.
714    pub(super) const WARN_CAP: usize = 10;
715
716    static WARNED_HASHES: LazyLock<Mutex<HashSet<u64>>> =
717        LazyLock::new(|| Mutex::new(HashSet::new()));
718    static DISTINCT: AtomicUsize = AtomicUsize::new(0);
719    static SUMMARY_EMITTED: AtomicBool = AtomicBool::new(false);
720
721    fn hash_pattern(pattern: &str) -> u64 {
722        use std::collections::hash_map::DefaultHasher;
723        use std::hash::{Hash, Hasher};
724        let mut h = DefaultHasher::new();
725        pattern.hash(&mut h);
726        h.finish()
727    }
728
729    /// Run the dedup + cap protocol. `force = true` always emits a
730    /// WARN (used by `OnBelowMin::Warn` callers who explicitly want
731    /// the log line).
732    pub(super) fn on_regex_fallback(
733        pattern: &str,
734        reason: &'static str,
735        hint: &'static str,
736        force: bool,
737    ) {
738        let h = hash_pattern(pattern);
739        let already_warned = {
740            let mut set = WARNED_HASHES.lock().unwrap_or_else(|e| e.into_inner());
741            !set.insert(h)
742        };
743
744        if already_warned && !force {
745            // Already warned about this pattern; respect the dedup.
746            return;
747        }
748
749        let n = if force {
750            // Force path doesn't count toward the cap.
751            DISTINCT.load(Ordering::Relaxed)
752        } else {
753            DISTINCT.fetch_add(1, Ordering::Relaxed) + 1
754        };
755
756        if force || n <= WARN_CAP {
757            tracing::warn!(
758                target: "hyperi_rustlib::strmatch",
759                pattern,
760                reason,
761                hint,
762                "pattern falls through to regex engine on hot path"
763            );
764        } else {
765            tracing::debug!(
766                target: "hyperi_rustlib::strmatch",
767                pattern,
768                reason,
769                hint,
770                "regex fallback (WARN suppressed past cap)"
771            );
772        }
773
774        if !force && n == WARN_CAP + 1 && !SUMMARY_EMITTED.swap(true, Ordering::Relaxed) {
775            tracing::info!(
776                target: "hyperi_rustlib::strmatch",
777                cap = WARN_CAP,
778                "{}+ distinct patterns have fallen through to the regex engine; \
779                 further fall-throughs log at DEBUG. Inspect StrMatcher::tier() / \
780                 StrMatcherSet::tier_counts() at runtime, or scrape the \
781                 hyperi_strmatch_regex_fallback_total metric.",
782                WARN_CAP,
783            );
784        }
785    }
786
787    /// Reset state for tests. Symbol exists only under `cfg(test)` so
788    /// production callers can't accidentally use it.
789    #[cfg(test)]
790    pub(super) fn reset_for_tests() {
791        WARNED_HASHES
792            .lock()
793            .unwrap_or_else(|e| e.into_inner())
794            .clear();
795        DISTINCT.store(0, Ordering::Relaxed);
796        SUMMARY_EMITTED.store(false, Ordering::Relaxed);
797    }
798}
799
800#[inline]
801fn metrics_inc_fallback() {
802    #[cfg(feature = "metrics")]
803    metrics::counter!("hyperi_strmatch_regex_fallback_total").increment(1);
804}
805
806// Re-export the warn-state reset for integration tests inside the
807// crate. Hidden from doc and not part of the public API.
808#[cfg(test)]
809#[doc(hidden)]
810pub fn reset_warn_state_for_tests() {
811    warn::reset_for_tests();
812}
813
814// ---------------------------------------------------------------------------
815// Tests
816// ---------------------------------------------------------------------------
817
818#[cfg(test)]
819mod tests;