sensitive_rs/engine/
mod.rs

1pub(crate) mod wumanber;
2use crate::WuManber;
3use aho_corasick::{AhoCorasick, AhoCorasickBuilder};
4use regex::Regex;
5use std::sync::Arc;
6
/// Matching algorithms that [`MultiPatternEngine`] can be built with.
///
/// The type is a plain fieldless enum, so it is `Copy`, comparable and hashable
/// (usable as a `HashMap`/`HashSet` key, e.g. for per-algorithm statistics).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum MatchAlgorithm {
    /// Aho-Corasick automaton; the algorithm selected by `MultiPatternEngine::default()`.
    /// Original note: suitable for medium-sized vocabulary.
    AhoCorasick,
    /// Wu-Manber matcher. Original note: suitable for large-scale thesaurus.
    // NOTE(review): `MultiPatternEngine::recommend_algorithm` picks this variant for
    // 0..=100 patterns, which does not match the "large-scale" note — confirm intent.
    WuManber,
    /// A single regular expression built from the escaped patterns.
    /// Original note: suitable for complex rule matching.
    Regex,
}
14
/// Multi-pattern matching engine
///
/// Keeps the pattern list together with the backend built for the currently
/// selected algorithm. `build_engines` resets all three backend slots and then
/// fills at most one of them, so the slots that do not belong to `algorithm`
/// are `None`; all slots are `None` when there are no patterns.
pub struct MultiPatternEngine {
    algorithm: MatchAlgorithm,    // The matching algorithm currently used
    ac: Option<Arc<AhoCorasick>>, // Aho-Corasick backend, if built
    wm: Option<Arc<WuManber>>,    // Wu-Manber backend, if built (also used as build-failure fallback)
    regex_set: Option<Regex>,     // Single regex alternation of all escaped patterns, if built
    patterns: Vec<String>,        // All patterns the engine was last built from
}
23
24impl Default for MultiPatternEngine {
25    fn default() -> Self {
26        Self { algorithm: MatchAlgorithm::AhoCorasick, ac: None, wm: None, regex_set: None, patterns: Vec::new() }
27    }
28}
29
30impl MultiPatternEngine {
31    /// Create a new engine and automatically select the algorithm based on the lexicon size
32    pub fn new(algorithm: Option<MatchAlgorithm>, patterns: &[String]) -> Self {
33        let algorithm = algorithm.unwrap_or_else(|| Self::recommend_algorithm(patterns.len()));
34        let mut engine = Self { algorithm, ..Default::default() };
35
36        engine.rebuild(patterns);
37        engine
38    }
39
40    /// Rebuild the engine (called when the pattern is updated)
41    pub fn rebuild(&mut self, patterns: &[String]) {
42        self.patterns = patterns.to_vec();
43
44        // Reevaluate algorithm selection based on new thesaurus size
45        let recommended = Self::recommend_algorithm(patterns.len());
46        if self.algorithm != recommended {
47            self.algorithm = recommended;
48        }
49
50        self.build_engines();
51    }
52
53    /// Recommended algorithm based on the lexicon
54    pub fn recommend_algorithm(word_count: usize) -> MatchAlgorithm {
55        match word_count {
56            0..=100 => MatchAlgorithm::WuManber,         // Small thesaurus for Wu-Manber
57            101..=10_000 => MatchAlgorithm::AhoCorasick, // Aho-Corasick for medium thesaurus
58            _ => MatchAlgorithm::Regex,                  // Use rules for super large thesaurus
59        }
60    }
61
62    /// Force rebuild using the specified algorithm
63    pub fn rebuild_with_algorithm(&mut self, patterns: &[String], algorithm: MatchAlgorithm) {
64        self.patterns = patterns.to_vec();
65        self.algorithm = algorithm;
66        self.build_engines();
67    }
68
69    /// Build the corresponding engine according to the current algorithm
70    fn build_engines(&mut self) {
71        // Clear all engines
72        self.ac = None;
73        self.wm = None;
74        self.regex_set = None;
75
76        // Build the corresponding engine according to the selected algorithm
77        match self.algorithm {
78            MatchAlgorithm::AhoCorasick => {
79                if !self.patterns.is_empty() {
80                    match AhoCorasickBuilder::new()
81                        .match_kind(aho_corasick::MatchKind::LeftmostLongest)
82                        .build(&self.patterns)
83                    {
84                        Ok(ac) => self.ac = Some(Arc::new(ac)),
85                        Err(_) => {
86                            // Fallback to WuManber if AhoCorasick build fails
87                            self.algorithm = MatchAlgorithm::WuManber;
88                            self.wm = Some(Arc::new(WuManber::new_chinese(self.patterns.clone())));
89                        }
90                    }
91                }
92            }
93            MatchAlgorithm::WuManber => {
94                if !self.patterns.is_empty() {
95                    self.wm = Some(Arc::new(WuManber::new_chinese(self.patterns.clone())));
96                }
97            }
98            MatchAlgorithm::Regex => {
99                if !self.patterns.is_empty() {
100                    let escaped_patterns: Vec<String> = self.patterns.iter().map(|p| regex::escape(p)).collect();
101                    let pattern = escaped_patterns.join("|");
102
103                    match Regex::new(&pattern) {
104                        Ok(regex) => self.regex_set = Some(regex),
105                        Err(_) => {
106                            // Fallback to WuManber if Regex build fails
107                            self.algorithm = MatchAlgorithm::WuManber;
108                            self.wm = Some(Arc::new(WuManber::new_chinese(self.patterns.clone())));
109                        }
110                    }
111                }
112            }
113        }
114    }
115
116    /// Get the currently used algorithm
117    pub fn current_algorithm(&self) -> MatchAlgorithm {
118        self.algorithm
119    }
120
121    /// Get all modes
122    pub fn get_patterns(&self) -> &[String] {
123        &self.patterns
124    }
125
126    /// Find the first match
127    pub fn find_first(&self, text: &str) -> Option<String> {
128        match self.algorithm {
129            MatchAlgorithm::AhoCorasick => {
130                self.ac.as_ref()?.find(text).map(|mat| text[mat.start()..mat.end()].to_string())
131            }
132            MatchAlgorithm::WuManber => {
133                // Use the search_string method to return directly to String
134                self.wm.as_ref()?.search_string(text)
135            }
136            MatchAlgorithm::Regex => self.regex_set.as_ref()?.find(text).map(|mat| mat.as_str().to_string()),
137        }
138    }
139
140    /// Replace all matches with optimized performance
141    pub fn replace_all(&self, text: &str, replacement: &str) -> String {
142        match self.algorithm {
143            MatchAlgorithm::AhoCorasick => {
144                if let Some(ac) = &self.ac {
145                    ac.replace_all(text, &[replacement]).to_string()
146                } else {
147                    text.to_string()
148                }
149            }
150            MatchAlgorithm::WuManber => {
151                if let Some(wm) = &self.wm {
152                    if replacement.is_empty() {
153                        wm.remove_all(text)
154                    } else {
155                        let repl_char = replacement.chars().next().unwrap_or('*');
156                        wm.replace_all(text, repl_char)
157                    }
158                } else {
159                    text.to_string()
160                }
161            }
162            MatchAlgorithm::Regex => {
163                if let Some(regex) = &self.regex_set {
164                    regex.replace_all(text, replacement).to_string()
165                } else {
166                    text.to_string()
167                }
168            }
169        }
170    }
171
172    /// Find all matches
173    pub fn find_all(&self, text: &str) -> Vec<String> {
174        match self.algorithm {
175            MatchAlgorithm::AhoCorasick => {
176                if let Some(ac) = &self.ac {
177                    ac.find_iter(text).map(|mat| text[mat.start()..mat.end()].to_string()).collect()
178                } else {
179                    Vec::new()
180                }
181            }
182            MatchAlgorithm::WuManber => {
183                if let Some(wm) = &self.wm {
184                    wm.search_all_strings(text)
185                } else {
186                    Vec::new()
187                }
188            }
189            MatchAlgorithm::Regex => {
190                if let Some(regex) = &self.regex_set {
191                    regex.find_iter(text).map(|mat| mat.as_str().to_string()).collect()
192                } else {
193                    Vec::new()
194                }
195            }
196        }
197    }
198
199    /// Get detailed match information
200    pub fn find_matches_with_positions(&self, text: &str) -> Vec<MatchInfo> {
201        match self.algorithm {
202            MatchAlgorithm::AhoCorasick => {
203                if let Some(ac) = &self.ac {
204                    ac.find_iter(text)
205                        .map(|mat| MatchInfo {
206                            pattern: text[mat.start()..mat.end()].to_string(),
207                            start: mat.start(),
208                            end: mat.end(),
209                        })
210                        .collect()
211                } else {
212                    Vec::new()
213                }
214            }
215            MatchAlgorithm::WuManber => {
216                if let Some(wm) = &self.wm {
217                    wm.find_matches(text)
218                        .into_iter()
219                        .filter_map(|m| {
220                            let pattern = text.get(m.start..m.end)?;
221                            Some(MatchInfo { pattern: pattern.to_string(), start: m.start, end: m.end })
222                        })
223                        .collect()
224                } else {
225                    Vec::new()
226                }
227            }
228            MatchAlgorithm::Regex => {
229                if let Some(regex) = &self.regex_set {
230                    regex
231                        .find_iter(text)
232                        .map(|mat| MatchInfo { pattern: mat.as_str().to_string(), start: mat.start(), end: mat.end() })
233                        .collect()
234                } else {
235                    Vec::new()
236                }
237            }
238        }
239    }
240
241    /// Check if text contains any patterns
242    pub fn contains_any(&self, text: &str) -> bool {
243        self.find_first(text).is_some()
244    }
245
246    /// Get engine statistics
247    pub fn stats(&self) -> EngineStats {
248        EngineStats {
249            algorithm: self.algorithm,
250            pattern_count: self.patterns.len(),
251            memory_usage: self.estimate_memory_usage(),
252        }
253    }
254
255    /// Estimate memory usage
256    fn estimate_memory_usage(&self) -> usize {
257        let patterns_memory = self.patterns.iter().map(|p| p.len()).sum::<usize>();
258
259        let engine_memory = match self.algorithm {
260            MatchAlgorithm::WuManber => {
261                if let Some(wm) = &self.wm {
262                    wm.memory_stats().total_memory
263                } else {
264                    0
265                }
266            }
267            _ => patterns_memory * 2, // Rough estimate for other algorithms
268        };
269
270        patterns_memory + engine_memory
271    }
272}
273
/// A single match together with its position in the searched text.
///
/// Implements `PartialEq`/`Eq` so match results can be compared and asserted
/// on directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MatchInfo {
    /// The matched text, i.e. the slice `start..end` of the searched text.
    pub pattern: String,
    /// Start offset of the match in the searched text (inclusive).
    pub start: usize,
    /// End offset of the match in the searched text (exclusive).
    pub end: usize,
}
281
282/// Engine statistics
283#[derive(Debug, Clone)]
284pub struct EngineStats {
285    pub algorithm: MatchAlgorithm,
286    pub pattern_count: usize,
287    pub memory_usage: usize,
288}