sensitive_rs/engine/
mod.rs1pub mod wumanber;
2use crate::WuManber;
3use aho_corasick::{AhoCorasick, AhoCorasickBuilder};
4use regex::Regex;
5use std::sync::Arc;
6
/// Matching back-ends that [`MultiPatternEngine`] can dispatch to.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum MatchAlgorithm {
    /// Automaton built with `aho_corasick::AhoCorasickBuilder`.
    AhoCorasick,
    /// The crate-local `WuManber` matcher.
    WuManber,
    /// A single `regex::Regex` made of the escaped patterns joined with `|`.
    Regex,
}
20
21pub struct MultiPatternEngine {
23 algorithm: MatchAlgorithm, ac: Option<Arc<AhoCorasick>>, wm: Option<Arc<WuManber>>, regex_set: Option<Regex>, patterns: Vec<String>, }
29
30impl std::fmt::Debug for MultiPatternEngine {
31 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
32 f.debug_struct("MultiPatternEngine")
33 .field("algorithm", &self.algorithm)
34 .field("pattern_count", &self.patterns.len())
35 .field("has_ac", &self.ac.is_some())
36 .field("has_wm", &self.wm.is_some())
37 .field("has_regex", &self.regex_set.is_some())
38 .finish()
39 }
40}
41
42impl Default for MultiPatternEngine {
43 fn default() -> Self {
44 Self { algorithm: MatchAlgorithm::AhoCorasick, ac: None, wm: None, regex_set: None, patterns: Vec::new() }
45 }
46}
47
48impl MultiPatternEngine {
49 pub fn new(algorithm: Option<MatchAlgorithm>, patterns: &[String]) -> Self {
51 let algorithm = algorithm.unwrap_or_else(|| Self::recommend_algorithm(patterns.len()));
52 let mut engine = Self { algorithm, ..Default::default() };
53
54 engine.rebuild(patterns);
55 engine
56 }
57
58 pub fn rebuild(&mut self, patterns: &[String]) {
60 self.patterns = patterns.to_vec();
61
62 let recommended = Self::recommend_algorithm(patterns.len());
64 if self.algorithm != recommended {
65 self.algorithm = recommended;
66 }
67
68 self.build_engines();
69 }
70
71 pub fn recommend_algorithm(word_count: usize) -> MatchAlgorithm {
77 match word_count {
78 0..=100 => MatchAlgorithm::WuManber,
79 101..=10_000 => MatchAlgorithm::AhoCorasick,
80 _ => MatchAlgorithm::Regex,
81 }
82 }
83
84 pub fn rebuild_with_algorithm(&mut self, patterns: &[String], algorithm: MatchAlgorithm) {
86 self.patterns = patterns.to_vec();
87 self.algorithm = algorithm;
88 self.build_engines();
89 }
90
91 fn build_engines(&mut self) {
93 self.ac = None;
95 self.wm = None;
96 self.regex_set = None;
97
98 match self.algorithm {
100 MatchAlgorithm::AhoCorasick => {
101 if !self.patterns.is_empty() {
102 match AhoCorasickBuilder::new()
103 .match_kind(aho_corasick::MatchKind::LeftmostLongest)
104 .build(&self.patterns)
105 {
106 Ok(ac) => self.ac = Some(Arc::new(ac)),
107 Err(_) => {
108 self.algorithm = MatchAlgorithm::WuManber;
110 self.wm = Some(Arc::new(WuManber::new_chinese(self.patterns.clone())));
111 }
112 }
113 }
114 }
115 MatchAlgorithm::WuManber => {
116 if !self.patterns.is_empty() {
117 self.wm = Some(Arc::new(WuManber::new_chinese(self.patterns.clone())));
118 }
119 }
120 MatchAlgorithm::Regex => {
121 if !self.patterns.is_empty() {
122 let escaped_patterns: Vec<String> = self.patterns.iter().map(|p| regex::escape(p)).collect();
123 let pattern = escaped_patterns.join("|");
124
125 match Regex::new(&pattern) {
126 Ok(regex) => self.regex_set = Some(regex),
127 Err(_) => {
128 self.algorithm = MatchAlgorithm::WuManber;
130 self.wm = Some(Arc::new(WuManber::new_chinese(self.patterns.clone())));
131 }
132 }
133 }
134 }
135 }
136 }
137
138 pub fn current_algorithm(&self) -> MatchAlgorithm {
140 self.algorithm
141 }
142
143 pub fn get_patterns(&self) -> &[String] {
145 &self.patterns
146 }
147
148 pub fn find_first(&self, text: &str) -> Option<String> {
150 match self.algorithm {
151 MatchAlgorithm::AhoCorasick => {
152 self.ac.as_ref()?.find(text).map(|mat| text[mat.start()..mat.end()].to_string())
153 }
154 MatchAlgorithm::WuManber => {
155 self.wm.as_ref()?.search_string(text)
157 }
158 MatchAlgorithm::Regex => self.regex_set.as_ref()?.find(text).map(|mat| mat.as_str().to_string()),
159 }
160 }
161
162 pub fn replace_all(&self, text: &str, replacement: &str) -> String {
164 match self.algorithm {
165 MatchAlgorithm::AhoCorasick => {
166 if let Some(ac) = &self.ac {
167 ac.replace_all(text, &[replacement]).to_string()
168 } else {
169 text.to_string()
170 }
171 }
172 MatchAlgorithm::WuManber => {
173 if let Some(wm) = &self.wm {
174 if replacement.is_empty() {
175 wm.remove_all(text)
176 } else {
177 let repl_char = replacement.chars().next().unwrap_or('*');
178 wm.replace_all(text, repl_char)
179 }
180 } else {
181 text.to_string()
182 }
183 }
184 MatchAlgorithm::Regex => {
185 if let Some(regex) = &self.regex_set {
186 regex.replace_all(text, replacement).to_string()
187 } else {
188 text.to_string()
189 }
190 }
191 }
192 }
193
194 pub fn find_all(&self, text: &str) -> Vec<String> {
196 match self.algorithm {
197 MatchAlgorithm::AhoCorasick => {
198 if let Some(ac) = &self.ac {
199 ac.find_iter(text).map(|mat| text[mat.start()..mat.end()].to_string()).collect()
200 } else {
201 Vec::new()
202 }
203 }
204 MatchAlgorithm::WuManber => {
205 if let Some(wm) = &self.wm {
206 wm.search_all_strings(text)
207 } else {
208 Vec::new()
209 }
210 }
211 MatchAlgorithm::Regex => {
212 if let Some(regex) = &self.regex_set {
213 regex.find_iter(text).map(|mat| mat.as_str().to_string()).collect()
214 } else {
215 Vec::new()
216 }
217 }
218 }
219 }
220
221 pub fn find_matches_with_positions(&self, text: &str) -> Vec<MatchInfo> {
223 match self.algorithm {
224 MatchAlgorithm::AhoCorasick => {
225 if let Some(ac) = &self.ac {
226 ac.find_iter(text)
227 .map(|mat| MatchInfo {
228 pattern: text[mat.start()..mat.end()].to_string(),
229 start: mat.start(),
230 end: mat.end(),
231 })
232 .collect()
233 } else {
234 Vec::new()
235 }
236 }
237 MatchAlgorithm::WuManber => {
238 if let Some(wm) = &self.wm {
239 wm.find_matches(text)
240 .into_iter()
241 .filter_map(|m| {
242 let pattern = text.get(m.start..m.end)?;
243 Some(MatchInfo { pattern: pattern.to_string(), start: m.start, end: m.end })
244 })
245 .collect()
246 } else {
247 Vec::new()
248 }
249 }
250 MatchAlgorithm::Regex => {
251 if let Some(regex) = &self.regex_set {
252 regex
253 .find_iter(text)
254 .map(|mat| MatchInfo { pattern: mat.as_str().to_string(), start: mat.start(), end: mat.end() })
255 .collect()
256 } else {
257 Vec::new()
258 }
259 }
260 }
261 }
262
263 pub fn contains_any(&self, text: &str) -> bool {
265 self.find_first(text).is_some()
266 }
267
268 pub fn stats(&self) -> EngineStats {
270 EngineStats {
271 algorithm: self.algorithm,
272 pattern_count: self.patterns.len(),
273 memory_usage: self.estimate_memory_usage(),
274 }
275 }
276
277 fn estimate_memory_usage(&self) -> usize {
279 let patterns_memory = self.patterns.iter().map(|p| p.len()).sum::<usize>();
280
281 let engine_memory = match self.algorithm {
282 MatchAlgorithm::WuManber => {
283 if let Some(wm) = &self.wm {
284 wm.memory_stats().total_memory
285 } else {
286 0
287 }
288 }
289 _ => patterns_memory * 2, };
291
292 patterns_memory + engine_memory
293 }
294}
295
/// One match reported by `MultiPatternEngine::find_matches_with_positions`.
#[derive(Clone, Debug)]
pub struct MatchInfo {
    /// The matched text, i.e. the slice `start..end` of the searched text.
    pub pattern: String,
    /// Byte offset in the searched text where the match begins.
    pub start: usize,
    /// Byte offset in the searched text one past the end of the match.
    pub end: usize,
}
303
304#[derive(Debug, Clone)]
306pub struct EngineStats {
307 pub algorithm: MatchAlgorithm,
308 pub pattern_count: usize,
309 pub memory_usage: usize,
310}