Skip to main content

sensitive_rs/engine/
mod.rs

1pub mod wumanber;
2use crate::WuManber;
3use aho_corasick::{AhoCorasick, AhoCorasickBuilder};
4use regex::Regex;
5use std::sync::Arc;
6
/// Matching algorithms that [`MultiPatternEngine`] can be backed by.
///
/// The size ranges mentioned on each variant are the ones applied by
/// [`MultiPatternEngine::recommend_algorithm`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum MatchAlgorithm {
    /// Aho-Corasick automaton.
    ///
    /// Recommended for medium-sized vocabularies (101 to 10,000 patterns).
    AhoCorasick,
    /// Wu-Manber matcher.
    ///
    /// Recommended for small vocabularies (0 to 100 patterns).
    WuManber,
    /// A single regular expression built as an alternation of the escaped patterns.
    ///
    /// Recommended for very large vocabularies (more than 10,000 patterns).
    Regex,
}
20
21/// Multi-pattern matching engine
22pub struct MultiPatternEngine {
23    algorithm: MatchAlgorithm,    // The matching algorithm currently used
24    ac: Option<Arc<AhoCorasick>>, // Aho-Corasick Engine
25    wm: Option<Arc<WuManber>>,    // Wu-Manber Engine
26    regex_set: Option<Regex>,     // Regular Expression Engine
27    patterns: Vec<String>,        // Store all modes
28}
29
30impl std::fmt::Debug for MultiPatternEngine {
31    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
32        f.debug_struct("MultiPatternEngine")
33            .field("algorithm", &self.algorithm)
34            .field("pattern_count", &self.patterns.len())
35            .field("has_ac", &self.ac.is_some())
36            .field("has_wm", &self.wm.is_some())
37            .field("has_regex", &self.regex_set.is_some())
38            .finish()
39    }
40}
41
42impl Default for MultiPatternEngine {
43    fn default() -> Self {
44        Self { algorithm: MatchAlgorithm::AhoCorasick, ac: None, wm: None, regex_set: None, patterns: Vec::new() }
45    }
46}
47
48impl MultiPatternEngine {
49    /// Create a new engine and automatically select the algorithm based on the lexicon size
50    pub fn new(algorithm: Option<MatchAlgorithm>, patterns: &[String]) -> Self {
51        let algorithm = algorithm.unwrap_or_else(|| Self::recommend_algorithm(patterns.len()));
52        let mut engine = Self { algorithm, ..Default::default() };
53
54        engine.rebuild(patterns);
55        engine
56    }
57
58    /// Rebuild the engine (called when the pattern is updated)
59    pub fn rebuild(&mut self, patterns: &[String]) {
60        self.patterns = patterns.to_vec();
61
62        // Reevaluate algorithm selection based on new thesaurus size
63        let recommended = Self::recommend_algorithm(patterns.len());
64        if self.algorithm != recommended {
65            self.algorithm = recommended;
66        }
67
68        self.build_engines();
69    }
70
71    /// Recommended algorithm based on the lexicon size
72    ///
73    /// - 0-100 patterns: WuManber (few patterns = small tables, quick scan)
74    /// - 101-10,000 patterns: AhoCorasick (automaton-based, O(n) scan)
75    /// - 10,000+ patterns: Regex (compilation overhead amortized)
76    pub fn recommend_algorithm(word_count: usize) -> MatchAlgorithm {
77        match word_count {
78            0..=100 => MatchAlgorithm::WuManber,
79            101..=10_000 => MatchAlgorithm::AhoCorasick,
80            _ => MatchAlgorithm::Regex,
81        }
82    }
83
84    /// Force rebuild using the specified algorithm
85    pub fn rebuild_with_algorithm(&mut self, patterns: &[String], algorithm: MatchAlgorithm) {
86        self.patterns = patterns.to_vec();
87        self.algorithm = algorithm;
88        self.build_engines();
89    }
90
91    /// Build the corresponding engine according to the current algorithm
92    fn build_engines(&mut self) {
93        // Clear all engines
94        self.ac = None;
95        self.wm = None;
96        self.regex_set = None;
97
98        // Build the corresponding engine according to the selected algorithm
99        match self.algorithm {
100            MatchAlgorithm::AhoCorasick => {
101                if !self.patterns.is_empty() {
102                    match AhoCorasickBuilder::new()
103                        .match_kind(aho_corasick::MatchKind::LeftmostLongest)
104                        .build(&self.patterns)
105                    {
106                        Ok(ac) => self.ac = Some(Arc::new(ac)),
107                        Err(_) => {
108                            // Fallback to WuManber if AhoCorasick build fails
109                            self.algorithm = MatchAlgorithm::WuManber;
110                            self.wm = Some(Arc::new(WuManber::new_chinese(self.patterns.clone())));
111                        }
112                    }
113                }
114            }
115            MatchAlgorithm::WuManber => {
116                if !self.patterns.is_empty() {
117                    self.wm = Some(Arc::new(WuManber::new_chinese(self.patterns.clone())));
118                }
119            }
120            MatchAlgorithm::Regex => {
121                if !self.patterns.is_empty() {
122                    let escaped_patterns: Vec<String> = self.patterns.iter().map(|p| regex::escape(p)).collect();
123                    let pattern = escaped_patterns.join("|");
124
125                    match Regex::new(&pattern) {
126                        Ok(regex) => self.regex_set = Some(regex),
127                        Err(_) => {
128                            // Fallback to WuManber if Regex build fails
129                            self.algorithm = MatchAlgorithm::WuManber;
130                            self.wm = Some(Arc::new(WuManber::new_chinese(self.patterns.clone())));
131                        }
132                    }
133                }
134            }
135        }
136    }
137
138    /// Get the currently used algorithm
139    pub fn current_algorithm(&self) -> MatchAlgorithm {
140        self.algorithm
141    }
142
143    /// Get all modes
144    pub fn get_patterns(&self) -> &[String] {
145        &self.patterns
146    }
147
148    /// Find the first match
149    pub fn find_first(&self, text: &str) -> Option<String> {
150        match self.algorithm {
151            MatchAlgorithm::AhoCorasick => {
152                self.ac.as_ref()?.find(text).map(|mat| text[mat.start()..mat.end()].to_string())
153            }
154            MatchAlgorithm::WuManber => {
155                // Use the search_string method to return directly to String
156                self.wm.as_ref()?.search_string(text)
157            }
158            MatchAlgorithm::Regex => self.regex_set.as_ref()?.find(text).map(|mat| mat.as_str().to_string()),
159        }
160    }
161
162    /// Replace all matches with optimized performance
163    pub fn replace_all(&self, text: &str, replacement: &str) -> String {
164        match self.algorithm {
165            MatchAlgorithm::AhoCorasick => {
166                if let Some(ac) = &self.ac {
167                    ac.replace_all(text, &[replacement]).to_string()
168                } else {
169                    text.to_string()
170                }
171            }
172            MatchAlgorithm::WuManber => {
173                if let Some(wm) = &self.wm {
174                    if replacement.is_empty() {
175                        wm.remove_all(text)
176                    } else {
177                        let repl_char = replacement.chars().next().unwrap_or('*');
178                        wm.replace_all(text, repl_char)
179                    }
180                } else {
181                    text.to_string()
182                }
183            }
184            MatchAlgorithm::Regex => {
185                if let Some(regex) = &self.regex_set {
186                    regex.replace_all(text, replacement).to_string()
187                } else {
188                    text.to_string()
189                }
190            }
191        }
192    }
193
194    /// Find all matches
195    pub fn find_all(&self, text: &str) -> Vec<String> {
196        match self.algorithm {
197            MatchAlgorithm::AhoCorasick => {
198                if let Some(ac) = &self.ac {
199                    ac.find_iter(text).map(|mat| text[mat.start()..mat.end()].to_string()).collect()
200                } else {
201                    Vec::new()
202                }
203            }
204            MatchAlgorithm::WuManber => {
205                if let Some(wm) = &self.wm {
206                    wm.search_all_strings(text)
207                } else {
208                    Vec::new()
209                }
210            }
211            MatchAlgorithm::Regex => {
212                if let Some(regex) = &self.regex_set {
213                    regex.find_iter(text).map(|mat| mat.as_str().to_string()).collect()
214                } else {
215                    Vec::new()
216                }
217            }
218        }
219    }
220
221    /// Get detailed match information
222    pub fn find_matches_with_positions(&self, text: &str) -> Vec<MatchInfo> {
223        match self.algorithm {
224            MatchAlgorithm::AhoCorasick => {
225                if let Some(ac) = &self.ac {
226                    ac.find_iter(text)
227                        .map(|mat| MatchInfo {
228                            pattern: text[mat.start()..mat.end()].to_string(),
229                            start: mat.start(),
230                            end: mat.end(),
231                        })
232                        .collect()
233                } else {
234                    Vec::new()
235                }
236            }
237            MatchAlgorithm::WuManber => {
238                if let Some(wm) = &self.wm {
239                    wm.find_matches(text)
240                        .into_iter()
241                        .filter_map(|m| {
242                            let pattern = text.get(m.start..m.end)?;
243                            Some(MatchInfo { pattern: pattern.to_string(), start: m.start, end: m.end })
244                        })
245                        .collect()
246                } else {
247                    Vec::new()
248                }
249            }
250            MatchAlgorithm::Regex => {
251                if let Some(regex) = &self.regex_set {
252                    regex
253                        .find_iter(text)
254                        .map(|mat| MatchInfo { pattern: mat.as_str().to_string(), start: mat.start(), end: mat.end() })
255                        .collect()
256                } else {
257                    Vec::new()
258                }
259            }
260        }
261    }
262
263    /// Check if text contains any patterns
264    pub fn contains_any(&self, text: &str) -> bool {
265        self.find_first(text).is_some()
266    }
267
268    /// Get engine statistics
269    pub fn stats(&self) -> EngineStats {
270        EngineStats {
271            algorithm: self.algorithm,
272            pattern_count: self.patterns.len(),
273            memory_usage: self.estimate_memory_usage(),
274        }
275    }
276
277    /// Estimate memory usage
278    fn estimate_memory_usage(&self) -> usize {
279        let patterns_memory = self.patterns.iter().map(|p| p.len()).sum::<usize>();
280
281        let engine_memory = match self.algorithm {
282            MatchAlgorithm::WuManber => {
283                if let Some(wm) = &self.wm {
284                    wm.memory_stats().total_memory
285                } else {
286                    0
287                }
288            }
289            _ => patterns_memory * 2, // Rough estimate for other algorithms
290        };
291
292        patterns_memory + engine_memory
293    }
294}
295
/// A single match together with its position in the searched text.
///
/// Derives `PartialEq`/`Eq` so match results can be compared directly
/// (for example with `assert_eq!`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MatchInfo {
    /// The matched text.
    pub pattern: String,
    /// Start offset of the match in the searched text.
    pub start: usize,
    /// End offset (exclusive) of the match in the searched text.
    pub end: usize,
}
303
304/// Engine statistics
305#[derive(Debug, Clone)]
306pub struct EngineStats {
307    pub algorithm: MatchAlgorithm,
308    pub pattern_count: usize,
309    pub memory_usage: usize,
310}