Skip to main content

multimatch/
pattern.rs

1//! Pattern definitions and the builder for compiling pattern sets.
2
3use crate::engine::MatchEngine;
4use crate::MatchError;
5
/// A single pattern to match against.
///
/// Definitions are plain values: two of them compare equal (and hash
/// identically) only when their `id`, `kind`, and `case_insensitive` fields
/// all agree, which lets callers deduplicate or look up definitions in
/// hash-based collections.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct PatternDef {
    /// User-assigned ID for this pattern (returned in match results).
    pub id: usize,
    /// The pattern content.
    pub kind: PatternKind,
    /// Whether matching is case-insensitive.
    pub case_insensitive: bool,
}

/// Whether a pattern is a literal string or a regex.
///
/// Equality is structural: a `Literal` never equals a `Regex`, even when both
/// carry the same text.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum PatternKind {
    /// Exact literal match (uses Aho-Corasick).
    Literal(String),
    /// Regex match (uses regex crate or Hyperscan).
    Regex(String),
}
25
26/// A compiled set of patterns ready for scanning.
27pub struct PatternSet {
28    engine: MatchEngine,
29    pattern_count: usize,
30}
31
32impl crate::Scanner for PatternSet {
33    fn scan(&self, input: &[u8]) -> Vec<crate::MatchResult> {
34        self.engine.scan(input)
35    }
36
37    fn is_match(&self, input: &[u8]) -> bool {
38        self.engine.is_match(input)
39    }
40
41    fn pattern_count(&self) -> usize {
42        self.pattern_count
43    }
44}
45
46impl PatternSet {
47    /// Start building a new pattern set.
48    pub fn builder() -> PatternSetBuilder {
49        PatternSetBuilder::new()
50    }
51
52    /// Scan a string for all matching patterns.
53    pub fn scan_str(&self, input: &str) -> Vec<crate::MatchResult> {
54        crate::Scanner::scan(self, input.as_bytes())
55    }
56}
57
58/// Builder for constructing a [`PatternSet`].
59pub struct PatternSetBuilder {
60    patterns: Vec<PatternDef>,
61}
62
63impl PatternSetBuilder {
64    /// Create a new empty builder.
65    pub fn new() -> Self {
66        Self {
67            patterns: Vec::new(),
68        }
69    }
70
71    /// Add a literal pattern (exact string match).
72    pub fn add_literal(mut self, literal: &str, id: usize) -> Self {
73        self.patterns.push(PatternDef {
74            id,
75            kind: PatternKind::Literal(literal.to_string()),
76            case_insensitive: false,
77        });
78        self
79    }
80
81    /// Add a case-insensitive literal pattern.
82    pub fn add_literal_ci(mut self, literal: &str, id: usize) -> Self {
83        self.patterns.push(PatternDef {
84            id,
85            kind: PatternKind::Literal(literal.to_string()),
86            case_insensitive: true,
87        });
88        self
89    }
90
91    /// Add a regex pattern.
92    pub fn add_regex(mut self, regex: &str, id: usize) -> Self {
93        self.patterns.push(PatternDef {
94            id,
95            kind: PatternKind::Regex(regex.to_string()),
96            case_insensitive: false,
97        });
98        self
99    }
100
101    /// Add a case-insensitive regex pattern.
102    pub fn add_regex_ci(mut self, regex: &str, id: usize) -> Self {
103        self.patterns.push(PatternDef {
104            id,
105            kind: PatternKind::Regex(regex.to_string()),
106            case_insensitive: true,
107        });
108        self
109    }
110
111    /// Add a raw PatternDef.
112    pub fn add(mut self, pattern: PatternDef) -> Self {
113        self.patterns.push(pattern);
114        self
115    }
116
117    /// Compile all patterns into a [`PatternSet`].
118    pub fn build(self) -> Result<PatternSet, MatchError> {
119        if self.patterns.is_empty() {
120            return Err(MatchError::Empty);
121        }
122        let count = self.patterns.len();
123        let engine = MatchEngine::compile(self.patterns)?;
124        Ok(PatternSet {
125            engine,
126            pattern_count: count,
127        })
128    }
129}
130
131impl Default for PatternSetBuilder {
132    fn default() -> Self {
133        Self::new()
134    }
135}
136
#[cfg(test)]
mod tests {
    //! Unit tests for the builder and for the thin `PatternSet` scanning API.

    use super::*;
    use crate::Scanner;

    /// Building without any pattern must fail with the dedicated `Empty` error,
    /// not merely with "some" error.
    #[test]
    fn builder_empty_fails() {
        assert!(matches!(
            PatternSetBuilder::new().build(),
            Err(MatchError::Empty)
        ));
    }

    #[test]
    fn builder_literal() {
        let ps = PatternSet::builder()
            .add_literal("hello", 0)
            .build()
            .unwrap();
        assert_eq!(ps.pattern_count(), 1);
    }

    #[test]
    fn builder_regex() {
        let ps = PatternSet::builder()
            .add_regex(r"\d+", 0)
            .build()
            .unwrap();
        assert_eq!(ps.pattern_count(), 1);
    }

    #[test]
    fn builder_mixed() {
        let ps = PatternSet::builder()
            .add_literal("token", 0)
            .add_regex(r"[A-Z]{5}", 1)
            .add_literal_ci("SECRET", 2)
            .build()
            .unwrap();
        assert_eq!(ps.pattern_count(), 3);
    }

    /// Smoke test: an empty literal must build and scan without panicking.
    #[test]
    fn edge_case_empty_literal() {
        let ps = PatternSet::builder()
            .add_literal("", 0)
            .build()
            .unwrap();
        assert_eq!(ps.pattern_count(), 1);
        // How many matches an empty pattern yields depends on how the engine
        // treats empty patterns, so the result is deliberately not asserted;
        // the call only has to complete.
        let _ = ps.scan_str("test");
    }

    #[test]
    fn edge_case_large_pattern_count() {
        let mut builder = PatternSet::builder();
        for i in 0..100 {
            builder = builder.add_literal(&format!("literal{}", i), i);
        }
        let ps = builder.build().unwrap();
        assert_eq!(ps.pattern_count(), 100);
    }

    #[test]
    fn edge_case_complex_regex() {
        let ps = PatternSet::builder()
            .add_regex(r"^(abc|def)*[0-9]+$", 99)
            .build()
            .unwrap();
        assert_eq!(ps.pattern_count(), 1);
    }

    #[test]
    fn edge_case_regex_ci_mixed() {
        let ps = PatternSet::builder()
            .add_regex_ci(r"[a-z]", 1)
            .add_literal_ci("TeSt", 2)
            .build()
            .unwrap();
        assert_eq!(ps.pattern_count(), 2);
    }

    /// IDs are user-assigned and need not be unique: both patterns are kept
    /// and each match reports the shared ID.
    #[test]
    fn edge_case_multiple_same_id() {
        let ps = PatternSet::builder()
            .add_literal("foo", 10)
            .add_literal("bar", 10)
            .build()
            .unwrap();
        assert_eq!(ps.pattern_count(), 2);
        let matches = ps.scan_str("foobar");
        assert_eq!(matches.len(), 2);
        assert_eq!(matches[0].pattern_id, 10);
        assert_eq!(matches[1].pattern_id, 10);
    }

    /// `scan_str` and the byte-oriented `Scanner` methods must agree.
    #[test]
    fn edge_case_scan_str_vs_bytes() {
        let ps = PatternSet::builder()
            .add_literal("rust", 1)
            .build()
            .unwrap();
        let s = "learning rust is fun";
        assert_eq!(ps.scan_str(s).len(), 1);
        assert_eq!(ps.scan(s.as_bytes()).len(), 1);
        assert!(ps.is_match(s.as_bytes()));
    }

    #[test]
    fn edge_case_add_raw_pattern_def() {
        let def = PatternDef {
            id: 42,
            kind: PatternKind::Literal("raw".to_string()),
            case_insensitive: false,
        };
        let ps = PatternSet::builder().add(def).build().unwrap();
        assert_eq!(ps.pattern_count(), 1);
    }

    /// Regex metacharacters inside a literal pattern are matched verbatim.
    #[test]
    fn edge_case_special_chars_literal() {
        let ps = PatternSet::builder()
            .add_literal("!@#$%^&*()_+", 5)
            .build()
            .unwrap();
        assert_eq!(ps.pattern_count(), 1);
        let m = ps.scan_str("here is !@#$%^&*()_+ special");
        assert_eq!(m.len(), 1);
    }

    /// The same punctuation run, expressed as an escaped regex.
    #[test]
    fn edge_case_special_chars_regex() {
        let ps = PatternSet::builder()
            .add_regex(r"\!\@#\$\%\^\&\*\(\)_\+", 5)
            .build()
            .unwrap();
        assert_eq!(ps.pattern_count(), 1);
        let m = ps.scan_str("here is !@#$%^&*()_+ special");
        assert_eq!(m.len(), 1);
    }

    #[test]
    fn edge_case_very_long_input_no_match() {
        let ps = PatternSet::builder().add_literal("FINDME", 1).build().unwrap();
        let input = "x".repeat(100_000);
        assert!(!ps.is_match(input.as_bytes()));
    }

    /// `Default` yields an empty builder, so building it fails the same way
    /// as building a freshly created one.
    #[test]
    fn edge_case_default_builder() {
        let builder = PatternSetBuilder::default();
        assert!(matches!(builder.build(), Err(MatchError::Empty)));
    }
}