sensitive_rs/engine/
mod.rs1pub(crate) mod wumanber;
2use crate::WuManber;
3use aho_corasick::{AhoCorasick, AhoCorasickBuilder};
4use regex::Regex;
5use std::sync::Arc;
6
/// Matching backend used by [`MultiPatternEngine`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum MatchAlgorithm {
    /// Automaton built with the `aho_corasick` crate.
    AhoCorasick,
    /// The crate's own `WuManber` matcher.
    WuManber,
    /// A single alternation compiled with the `regex` crate.
    Regex,
}
14
15pub struct MultiPatternEngine {
17 algorithm: MatchAlgorithm, ac: Option<Arc<AhoCorasick>>, wm: Option<Arc<WuManber>>, regex_set: Option<Regex>, patterns: Vec<String>, }
23
24impl Default for MultiPatternEngine {
25 fn default() -> Self {
26 Self { algorithm: MatchAlgorithm::AhoCorasick, ac: None, wm: None, regex_set: None, patterns: Vec::new() }
27 }
28}
29
30impl MultiPatternEngine {
31 pub fn new(algorithm: Option<MatchAlgorithm>, patterns: &[String]) -> Self {
33 let algorithm = algorithm.unwrap_or_else(|| Self::recommend_algorithm(patterns.len()));
34 let mut engine = Self { algorithm, ..Default::default() };
35
36 engine.rebuild(patterns);
37 engine
38 }
39
40 pub fn rebuild(&mut self, patterns: &[String]) {
42 self.patterns = patterns.to_vec();
43
44 let recommended = Self::recommend_algorithm(patterns.len());
46 if self.algorithm != recommended {
47 self.algorithm = recommended;
48 }
49
50 self.build_engines();
51 }
52
53 pub fn recommend_algorithm(word_count: usize) -> MatchAlgorithm {
55 match word_count {
56 0..=100 => MatchAlgorithm::WuManber, 101..=10_000 => MatchAlgorithm::AhoCorasick, _ => MatchAlgorithm::Regex, }
60 }
61
62 pub fn rebuild_with_algorithm(&mut self, patterns: &[String], algorithm: MatchAlgorithm) {
64 self.patterns = patterns.to_vec();
65 self.algorithm = algorithm;
66 self.build_engines();
67 }
68
69 fn build_engines(&mut self) {
71 self.ac = None;
73 self.wm = None;
74 self.regex_set = None;
75
76 match self.algorithm {
78 MatchAlgorithm::AhoCorasick => {
79 if !self.patterns.is_empty() {
80 match AhoCorasickBuilder::new()
81 .match_kind(aho_corasick::MatchKind::LeftmostLongest)
82 .build(&self.patterns)
83 {
84 Ok(ac) => self.ac = Some(Arc::new(ac)),
85 Err(_) => {
86 self.algorithm = MatchAlgorithm::WuManber;
88 self.wm = Some(Arc::new(WuManber::new_chinese(self.patterns.clone())));
89 }
90 }
91 }
92 }
93 MatchAlgorithm::WuManber => {
94 if !self.patterns.is_empty() {
95 self.wm = Some(Arc::new(WuManber::new_chinese(self.patterns.clone())));
96 }
97 }
98 MatchAlgorithm::Regex => {
99 if !self.patterns.is_empty() {
100 let escaped_patterns: Vec<String> = self.patterns.iter().map(|p| regex::escape(p)).collect();
101 let pattern = escaped_patterns.join("|");
102
103 match Regex::new(&pattern) {
104 Ok(regex) => self.regex_set = Some(regex),
105 Err(_) => {
106 self.algorithm = MatchAlgorithm::WuManber;
108 self.wm = Some(Arc::new(WuManber::new_chinese(self.patterns.clone())));
109 }
110 }
111 }
112 }
113 }
114 }
115
116 pub fn current_algorithm(&self) -> MatchAlgorithm {
118 self.algorithm
119 }
120
121 pub fn get_patterns(&self) -> &[String] {
123 &self.patterns
124 }
125
126 pub fn find_first(&self, text: &str) -> Option<String> {
128 match self.algorithm {
129 MatchAlgorithm::AhoCorasick => {
130 self.ac.as_ref()?.find(text).map(|mat| text[mat.start()..mat.end()].to_string())
131 }
132 MatchAlgorithm::WuManber => {
133 self.wm.as_ref()?.search_string(text)
135 }
136 MatchAlgorithm::Regex => self.regex_set.as_ref()?.find(text).map(|mat| mat.as_str().to_string()),
137 }
138 }
139
140 pub fn replace_all(&self, text: &str, replacement: &str) -> String {
142 match self.algorithm {
143 MatchAlgorithm::AhoCorasick => {
144 if let Some(ac) = &self.ac {
145 ac.replace_all(text, &[replacement]).to_string()
146 } else {
147 text.to_string()
148 }
149 }
150 MatchAlgorithm::WuManber => {
151 if let Some(wm) = &self.wm {
152 if replacement.is_empty() {
153 wm.remove_all(text)
154 } else {
155 let repl_char = replacement.chars().next().unwrap_or('*');
156 wm.replace_all(text, repl_char)
157 }
158 } else {
159 text.to_string()
160 }
161 }
162 MatchAlgorithm::Regex => {
163 if let Some(regex) = &self.regex_set {
164 regex.replace_all(text, replacement).to_string()
165 } else {
166 text.to_string()
167 }
168 }
169 }
170 }
171
172 pub fn find_all(&self, text: &str) -> Vec<String> {
174 match self.algorithm {
175 MatchAlgorithm::AhoCorasick => {
176 if let Some(ac) = &self.ac {
177 ac.find_iter(text).map(|mat| text[mat.start()..mat.end()].to_string()).collect()
178 } else {
179 Vec::new()
180 }
181 }
182 MatchAlgorithm::WuManber => {
183 if let Some(wm) = &self.wm {
184 wm.search_all_strings(text)
185 } else {
186 Vec::new()
187 }
188 }
189 MatchAlgorithm::Regex => {
190 if let Some(regex) = &self.regex_set {
191 regex.find_iter(text).map(|mat| mat.as_str().to_string()).collect()
192 } else {
193 Vec::new()
194 }
195 }
196 }
197 }
198
199 pub fn find_matches_with_positions(&self, text: &str) -> Vec<MatchInfo> {
201 match self.algorithm {
202 MatchAlgorithm::AhoCorasick => {
203 if let Some(ac) = &self.ac {
204 ac.find_iter(text)
205 .map(|mat| MatchInfo {
206 pattern: text[mat.start()..mat.end()].to_string(),
207 start: mat.start(),
208 end: mat.end(),
209 })
210 .collect()
211 } else {
212 Vec::new()
213 }
214 }
215 MatchAlgorithm::WuManber => {
216 if let Some(wm) = &self.wm {
217 wm.find_matches(text)
218 .into_iter()
219 .filter_map(|m| {
220 let pattern = text.get(m.start..m.end)?;
221 Some(MatchInfo { pattern: pattern.to_string(), start: m.start, end: m.end })
222 })
223 .collect()
224 } else {
225 Vec::new()
226 }
227 }
228 MatchAlgorithm::Regex => {
229 if let Some(regex) = &self.regex_set {
230 regex
231 .find_iter(text)
232 .map(|mat| MatchInfo { pattern: mat.as_str().to_string(), start: mat.start(), end: mat.end() })
233 .collect()
234 } else {
235 Vec::new()
236 }
237 }
238 }
239 }
240
241 pub fn contains_any(&self, text: &str) -> bool {
243 self.find_first(text).is_some()
244 }
245
246 pub fn stats(&self) -> EngineStats {
248 EngineStats {
249 algorithm: self.algorithm,
250 pattern_count: self.patterns.len(),
251 memory_usage: self.estimate_memory_usage(),
252 }
253 }
254
255 fn estimate_memory_usage(&self) -> usize {
257 let patterns_memory = self.patterns.iter().map(|p| p.len()).sum::<usize>();
258
259 let engine_memory = match self.algorithm {
260 MatchAlgorithm::WuManber => {
261 if let Some(wm) = &self.wm {
262 wm.memory_stats().total_memory
263 } else {
264 0
265 }
266 }
267 _ => patterns_memory * 2, };
269
270 patterns_memory + engine_memory
271 }
272}
273
/// A single match found in a searched text.
///
/// `PartialEq`/`Eq` are derived so that match results can be compared
/// directly, for example in assertions.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MatchInfo {
    /// The matched text, i.e. the slice `text[start..end]`.
    pub pattern: String,
    /// Byte offset in the searched text where the match begins.
    pub start: usize,
    /// Byte offset in the searched text just past the end of the match.
    pub end: usize,
}
281
282#[derive(Debug, Clone)]
284pub struct EngineStats {
285 pub algorithm: MatchAlgorithm,
286 pub pattern_count: usize,
287 pub memory_usage: usize,
288}