1use hashbrown::HashMap;
2use rayon::iter::{IndexedParallelIterator, IntoParallelRefIterator, ParallelIterator};
3use smallvec::SmallVec;
4use std::collections::HashSet;
5use std::hash::{DefaultHasher, Hash, Hasher};
6use std::sync::Arc;
7
8#[derive(Debug, Clone, Copy, PartialEq, Eq)]
10pub enum SpaceHandling {
11 Strict,
13 IgnoreSpaces,
15 NormalizeSpaces,
17}
18
19#[derive(Debug, Clone)]
22pub struct WuManber {
23 patterns: Vec<Arc<String>>,
24 original_patterns: Vec<String>,
25 pattern_set: HashSet<Arc<String>>, min_len: usize,
27 block_size: usize,
28 shift_table: HashMap<u64, usize>,
29 hash_table: HashMap<u64, SmallVec<[usize; 4]>>,
30 space_handling: SpaceHandling,
31}
32
33impl WuManber {
34 pub fn new_chinese(patterns: Vec<String>) -> Self {
36 Self::new_with_space_handling(patterns, SpaceHandling::Strict)
37 }
38
39 pub fn new(patterns: Vec<String>, block_size: usize) -> Self {
41 Self::new_with_block_size_and_space_handling(patterns, block_size, SpaceHandling::Strict)
42 }
43
44 pub fn new_parallel(patterns: Vec<String>, block_size: usize) -> Self {
46 Self::new_parallel_with_space_handling(patterns, block_size, SpaceHandling::Strict)
47 }
48
49 pub fn new_with_space_handling(patterns: Vec<String>, space_handling: SpaceHandling) -> Self {
51 let block_size = Self::calculate_optimal_block_size(&patterns, space_handling);
52 Self::new_with_block_size_and_space_handling(patterns, block_size, space_handling)
53 }
54
55 pub fn new_with_block_size_and_space_handling(
57 patterns: Vec<String>,
58 block_size: usize,
59 space_handling: SpaceHandling,
60 ) -> Self {
61 Self::build_instance(patterns, block_size, false, space_handling)
62 }
63
64 pub fn new_parallel_with_space_handling(
66 patterns: Vec<String>,
67 block_size: usize,
68 space_handling: SpaceHandling,
69 ) -> Self {
70 Self::build_instance(patterns, block_size, true, space_handling)
71 }
72
73 fn calculate_optimal_block_size(patterns: &[String], space_handling: SpaceHandling) -> usize {
75 if patterns.is_empty() {
76 return 2;
77 }
78
79 let processed_patterns: Vec<String> =
80 patterns.iter().map(|p| Self::preprocess_pattern(p, space_handling)).filter(|p| !p.is_empty()).collect();
81
82 let min_len = processed_patterns.iter().map(|p| p.chars().count()).min().unwrap_or(2);
83
84 match min_len {
86 1 => 1,
87 2..=3 => 2,
88 4..=10 => 3,
89 _ => (min_len / 2).clamp(2, 4),
90 }
91 }
92
93 fn preprocess_pattern(pattern: &str, space_handling: SpaceHandling) -> String {
95 match space_handling {
96 SpaceHandling::Strict => pattern.to_string(),
97 SpaceHandling::IgnoreSpaces => pattern.chars().filter(|c| !c.is_whitespace()).collect(),
98 SpaceHandling::NormalizeSpaces => {
99 let mut result = String::new();
100 let mut prev_was_space = false;
101
102 for ch in pattern.chars() {
103 if ch.is_whitespace() {
104 if !prev_was_space {
105 result.push(' ');
106 prev_was_space = true;
107 }
108 } else {
109 result.push(ch);
110 prev_was_space = false;
111 }
112 }
113
114 result
115 }
116 }
117 }
118
119 fn preprocess_text(&self, text: &str) -> String {
121 Self::preprocess_pattern(text, self.space_handling)
122 }
123
124 fn empty() -> Self {
126 WuManber {
127 patterns: Vec::new(),
128 original_patterns: Vec::new(),
129 pattern_set: HashSet::new(),
130 min_len: 0,
131 block_size: 2,
132 shift_table: HashMap::new(),
133 hash_table: HashMap::new(),
134 space_handling: SpaceHandling::Strict,
135 }
136 }
137
138 fn build_instance(patterns: Vec<String>, block_size: usize, parallel: bool, space_handling: SpaceHandling) -> Self {
140 if patterns.is_empty() {
141 return Self::empty();
142 }
143
144 let original_patterns = patterns.clone();
146
147 let processed_patterns: Vec<String> =
149 patterns.iter().map(|p| Self::preprocess_pattern(p, space_handling)).filter(|p| !p.is_empty()).collect();
150
151 if processed_patterns.is_empty() {
152 return Self::empty();
153 }
154
155 let min_len = processed_patterns.iter().map(|p| p.chars().count()).min().unwrap_or(1);
156
157 let safe_block_size = block_size.min(min_len);
159
160 let patterns_arc: Vec<Arc<String>> = processed_patterns.into_iter().map(Arc::new).collect();
161 let pattern_set: HashSet<Arc<String>> = patterns_arc.iter().cloned().collect();
163 let mut instance = WuManber {
164 patterns: patterns_arc,
165 original_patterns,
166 pattern_set,
167 min_len,
168 block_size: safe_block_size,
169 shift_table: HashMap::new(),
170 hash_table: HashMap::new(),
171 space_handling,
172 };
173
174 if parallel {
175 instance.build_tables_parallel();
176 } else {
177 instance.build_tables();
178 }
179
180 instance
181 }
182
183 fn build_tables(&mut self) {
185 self.build_shift_table();
186 self.build_hash_table();
187 }
188
189 fn build_tables_parallel(&mut self) {
191 self.build_shift_table_parallel();
192 self.build_hash_table_parallel();
193 }
194
195 fn build_shift_table(&mut self) {
197 for pattern in self.patterns.iter() {
198 let chars: Vec<char> = pattern.chars().collect();
199 let char_count = chars.len();
200
201 if char_count < self.block_size {
203 continue;
204 }
205
206 for i in 0..=(char_count - self.block_size) {
207 let block = self.extract_block_optimized(&chars, i);
208 let hash = Self::calculate_hash_fast(&block);
209 let shift = char_count - i - self.block_size;
210
211 self.shift_table.entry(hash).and_modify(|v| *v = (*v).min(shift)).or_insert(shift);
212 }
213 }
214 }
215
216 fn build_hash_table(&mut self) {
218 for (pattern_idx, pattern) in self.patterns.iter().enumerate() {
219 let chars: Vec<char> = pattern.chars().collect();
220 let char_count = chars.len();
221
222 if char_count >= self.block_size {
223 let start_pos = char_count - self.block_size;
224 let block = self.extract_block_optimized(&chars, start_pos);
225 let hash = Self::calculate_hash_fast(&block);
226
227 self.hash_table.entry(hash).or_default().push(pattern_idx);
228 }
229 }
230 }
231
232 fn build_shift_table_parallel(&mut self) {
234 let block_size = self.block_size;
235
236 let shift_entries: Vec<(u64, usize)> = self
237 .patterns
238 .par_iter()
239 .flat_map(|pattern| {
240 let chars: Vec<char> = pattern.chars().collect();
241 let char_count = chars.len();
242
243 if char_count < block_size {
244 return Vec::new();
245 }
246
247 (0..=(char_count - block_size))
248 .map(move |i| {
249 let block = chars[i..i + block_size].iter().collect::<String>();
250 let hash = Self::calculate_hash_fast(&block);
251 let shift = char_count - i - block_size;
252 (hash, shift)
253 })
254 .collect::<Vec<_>>()
255 })
256 .collect();
257
258 for (hash, shift) in shift_entries {
259 self.shift_table.entry(hash).and_modify(|v| *v = (*v).min(shift)).or_insert(shift);
260 }
261 }
262
263 fn build_hash_table_parallel(&mut self) {
265 let block_size = self.block_size;
266
267 let hash_entries: Vec<(u64, usize)> = self
268 .patterns
269 .par_iter()
270 .enumerate()
271 .filter_map(|(pattern_idx, pattern)| {
272 let chars: Vec<char> = pattern.chars().collect();
273 let char_count = chars.len();
274
275 if char_count >= block_size {
276 let start_pos = char_count - block_size;
277 let block = chars[start_pos..start_pos + block_size].iter().collect::<String>();
278 let hash = Self::calculate_hash_fast(&block);
279 Some((hash, pattern_idx))
280 } else {
281 None
282 }
283 })
284 .collect();
285
286 for (hash, pattern_idx) in hash_entries {
287 self.hash_table.entry(hash).or_insert_with(SmallVec::new).push(pattern_idx);
288 }
289 }
290
291 #[inline]
293 fn extract_block_optimized(&self, chars: &[char], start: usize) -> String {
294 chars[start..start + self.block_size].iter().collect()
295 }
296
297 #[inline]
299 fn calculate_hash_fast(s: &str) -> u64 {
300 let mut hasher = DefaultHasher::new();
301 s.hash(&mut hasher);
302 hasher.finish()
303 }
304
305 pub fn search(&self, text: &str) -> Option<Arc<String>> {
307 if self.patterns.is_empty() || text.is_empty() {
308 return None;
309 }
310
311 if self.space_handling != SpaceHandling::Strict {
312 return self.search_with_preprocessing(text);
313 }
314
315 let chars: Vec<char> = text.chars().collect();
316 let text_len = chars.len();
317
318 if text_len < self.min_len {
319 return None;
320 }
321
322 let mut pos = self.min_len - 1;
323
324 while pos < text_len {
325 if pos + 1 < self.block_size {
326 pos += 1;
327 continue;
328 }
329
330 let block_start = pos + 1 - self.block_size;
331 let block = self.extract_block_optimized(&chars, block_start);
332 let hash = Self::calculate_hash_fast(&block);
333
334 if let Some(&shift) = self.shift_table.get(&hash) {
335 if shift == 0 {
336 if let Some(pattern_indices) = self.hash_table.get(&hash) {
337 if let Some(result) = self.verify_matches_arc(&chars, pos, pattern_indices) {
338 return Some(result);
339 }
340 }
341
342 let prev_block_start = block_start.saturating_sub(1);
343 if prev_block_start < block_start {
344 let prev_block = self.extract_block_optimized(&chars, prev_block_start);
345 let prev_hash = Self::calculate_hash_fast(&prev_block);
346 if let Some(prev_pattern_indices) = self.hash_table.get(&prev_hash) {
347 if let Some(found) = self.verify_matches_arc(&chars, pos - 1, prev_pattern_indices) {
349 return Some(found);
350 }
351 }
352 }
353
354 pos += 1;
355 } else {
356 pos += shift;
357 }
358 } else {
359 pos += self.min_len.saturating_sub(self.block_size).saturating_add(1);
361 }
362 }
363
364 None
365 }
366
367 fn search_with_preprocessing(&self, text: &str) -> Option<Arc<String>> {
369 for (i, original_pattern) in self.original_patterns.iter().enumerate() {
370 let processed_text = self.preprocess_text(text);
371 let processed_pattern = Self::preprocess_pattern(original_pattern, self.space_handling);
372
373 if processed_text.contains(&processed_pattern) {
374 return self.patterns.get(i).cloned();
375 }
376 }
377 None
378 }
379
380 fn verify_matches_arc(&self, chars: &[char], pos: usize, pattern_indices: &[usize]) -> Option<Arc<String>> {
382 for &pattern_idx in pattern_indices {
383 if let Some(pattern) = self.patterns.get(pattern_idx) {
384 let pattern_chars: Vec<char> = pattern.chars().collect();
385 let pattern_len = pattern_chars.len();
386
387 if pos + 1 >= pattern_len {
388 let start = pos + 1 - pattern_len;
389 if chars[start..start + pattern_len] == pattern_chars[..] {
390 return Some(pattern.clone());
391 }
392 }
393 }
394 }
395 None
396 }
397
398 pub fn search_string(&self, text: &str) -> Option<String> {
400 if self.space_handling != SpaceHandling::Strict {
401 for original_pattern in &self.original_patterns {
403 let processed_text = self.preprocess_text(text);
404 let processed_pattern = Self::preprocess_pattern(original_pattern, self.space_handling);
405 if processed_text.contains(&processed_pattern) {
406 return Some(original_pattern.clone());
407 }
408 }
409 return None;
410 }
411
412 self.search(text).map(|arc| (*arc).clone())
413 }
414
415 pub fn search_all(&self, text: &str) -> Vec<Arc<String>> {
417 if self.patterns.is_empty() || text.is_empty() {
418 return Vec::new();
419 }
420
421 let mut results = Vec::new();
422
423 for (i, original_pattern) in self.original_patterns.iter().enumerate() {
425 let matches = match self.space_handling {
426 SpaceHandling::Strict => text.contains(original_pattern),
427 _ => {
428 let processed_text = self.preprocess_text(text);
429 let processed_pattern = Self::preprocess_pattern(original_pattern, self.space_handling);
430 processed_text.contains(&processed_pattern)
431 }
432 };
433
434 if matches {
435 if let Some(pattern) = self.patterns.get(i) {
436 results.push(pattern.clone());
437 }
438 }
439 }
440
441 results
442 }
443
444 pub fn search_all_strings(&self, text: &str) -> Vec<String> {
446 if self.patterns.is_empty() || text.is_empty() {
447 return Vec::new();
448 }
449
450 let mut results = Vec::new();
451
452 for original_pattern in &self.original_patterns {
453 let matches = match self.space_handling {
454 SpaceHandling::Strict => text.contains(original_pattern),
455 _ => {
456 let processed_text = self.preprocess_text(text);
457 let processed_pattern = Self::preprocess_pattern(original_pattern, self.space_handling);
458 processed_text.contains(&processed_pattern)
459 }
460 };
461
462 if matches {
463 results.push(original_pattern.clone());
464 }
465 }
466
467 results
468 }
469
470 pub fn replace_all(&self, text: &str, replacement: char) -> String {
472 let mut result = text.to_string();
473
474 for pattern in &self.original_patterns {
475 let replacement_str = replacement.to_string().repeat(pattern.chars().count());
476 result = result.replace(pattern, &replacement_str);
477 }
478
479 result
480 }
481
482 pub fn remove_all(&self, text: &str) -> String {
484 let mut result = text.to_string();
485
486 for pattern in &self.original_patterns {
487 result = result.replace(pattern, "");
488 }
489
490 result
491 }
492
493 pub fn find_matches(&self, text: &str) -> Vec<Match> {
495 let mut matches = Vec::new();
496
497 for pattern in &self.original_patterns {
498 let mut start = 0;
499 while let Some(pos) = text[start..].find(pattern) {
500 let absolute_start = start + pos;
501 let absolute_end = absolute_start + pattern.len();
502 matches.push(Match { start: absolute_start, end: absolute_end });
503 start = absolute_start + 1;
504 }
505 }
506
507 matches.sort_by_key(|m| m.start);
508 matches
509 }
510
511 pub fn patterns(&self) -> &[Arc<String>] {
513 &self.patterns
514 }
515
516 pub fn patterns_strings(&self) -> Vec<String> {
518 self.original_patterns.clone()
519 }
520
521 pub fn contains_pattern(&self, pattern: &str) -> bool {
523 let target = Arc::new(pattern.to_string());
524 self.pattern_set.contains(&target)
525 }
526
527 pub fn space_handling(&self) -> SpaceHandling {
529 self.space_handling
530 }
531
532 pub fn memory_stats(&self) -> WuManberMemoryStats {
534 let patterns_memory = self.patterns.iter().map(|p| size_of::<Arc<String>>() + p.len()).sum::<usize>();
535
536 let shift_table_memory = self.shift_table.len() * (size_of::<u64>() + size_of::<usize>());
537 let hash_table_memory = self
538 .hash_table
539 .iter()
540 .map(|(_k, v)| size_of::<u64>() + size_of::<SmallVec<[usize; 4]>>() + v.len() * size_of::<usize>())
541 .sum::<usize>();
542
543 let pattern_set_memory = self.pattern_set.len() * size_of::<Arc<String>>();
544
545 let total_memory = patterns_memory + shift_table_memory + hash_table_memory + pattern_set_memory;
546
547 WuManberMemoryStats {
548 total_patterns: self.patterns.len(),
549 patterns_memory,
550 shift_table_memory,
551 hash_table_memory,
552 total_memory,
553 }
554 }
555
556 pub fn stats(&self) -> WuManberStats {
558 WuManberStats {
559 pattern_count: self.patterns.len(),
560 min_length: self.min_len,
561 block_size: self.block_size,
562 shift_table_size: self.shift_table.len(),
563 hash_table_size: self.hash_table.len(),
564 }
565 }
566}
567
568#[derive(Debug, Clone, Copy, PartialEq, Eq)]
570pub struct Match {
571 pub start: usize,
572 pub end: usize,
573}
574
575#[derive(Debug, Clone)]
577pub struct WuManberStats {
578 pub pattern_count: usize,
579 pub min_length: usize,
580 pub block_size: usize,
581 pub shift_table_size: usize,
582 pub hash_table_size: usize,
583}
584
585#[derive(Debug, Clone)]
587pub struct WuManberMemoryStats {
588 pub total_patterns: usize,
589 pub patterns_memory: usize,
590 pub shift_table_memory: usize,
591 pub hash_table_memory: usize,
592 pub total_memory: usize,
593}
594
595#[cfg(test)]
596mod tests {
597 use super::*;
598
599 #[test]
600 fn test_basic_matching() {
601 let wm =
602 WuManber::new_chinese(vec!["色情".to_string(), "赌博".to_string(), "诈骗".to_string(), "扯蛋".to_string()]);
603
604 assert_eq!(wm.search_string("正常内容"), None);
605 assert_eq!(wm.search_string("测试含有赌博内容"), Some("赌博".to_string()));
606 assert_eq!(wm.search_string("测试还有色情赌博图片"), Some("色情".to_string()));
607 assert_eq!(wm.search_string("测试有色情赌博图片"), Some("色情".to_string()));
608 assert_eq!(wm.search_string("还有诈骗色情测试"), Some("诈骗".to_string()));
609 }
610
611 #[test]
612 fn test_varied_length() {
613 let wm = WuManber::new(vec!["赌".to_string(), "赌博".to_string(), "赌博机".to_string()], 1);
614
615 assert_eq!(wm.search_string("赌"), Some("赌".to_string()));
616 assert_eq!(wm.search_string("赌博"), Some("赌".to_string())); assert_eq!(wm.search_string("赌博机"), Some("赌".to_string())); }
619
620 #[test]
621 fn test_space_handling_strategies() {
622 let patterns = vec!["hello world".to_string(), "test pattern".to_string()];
623
624 let wm_strict = WuManber::new_with_space_handling(patterns.clone(), SpaceHandling::Strict);
626 assert_eq!(wm_strict.search_string("hello world"), Some("hello world".to_string()));
627 assert_eq!(wm_strict.search_string("helloworld"), None);
628
629 let wm_ignore = WuManber::new_with_space_handling(patterns.clone(), SpaceHandling::IgnoreSpaces);
631 assert_eq!(wm_ignore.search_string("hello world"), Some("hello world".to_string()));
632 assert_eq!(wm_ignore.search_string("helloworld"), Some("hello world".to_string()));
633
634 let wm_normalize = WuManber::new_with_space_handling(patterns.clone(), SpaceHandling::NormalizeSpaces);
636 assert_eq!(wm_normalize.search_string("test pattern"), Some("test pattern".to_string()));
637 assert_eq!(wm_normalize.search_string("test pattern"), Some("test pattern".to_string()));
638 }
639
640 #[test]
641 fn test_mixed_patterns() {
642 let mixed_patterns =
643 vec!["关键词2500".to_string(), "关键词 3000".to_string(), "test".to_string(), "hello world".to_string()];
644
645 let wm = WuManber::new_chinese(mixed_patterns);
646
647 assert_eq!(wm.search_string("包含关键词2500的文本"), Some("关键词2500".to_string()));
648 assert_eq!(wm.search_string("包含关键词 3000 的文本"), Some("关键词 3000".to_string()));
649 assert_eq!(wm.search_string("this is test"), Some("test".to_string()));
650 assert_eq!(wm.search_string("say hello world"), Some("hello world".to_string()));
651 }
652
653 #[test]
654 fn test_complex_patterns_with_spaces() {
655 let complex_patterns = vec![
656 "Hello World".to_string(),
657 "你好 世界".to_string(),
658 "multiple spaces".to_string(),
659 "tab\there".to_string(),
660 "new\nline".to_string(),
661 ];
662
663 let wm = WuManber::new_chinese(complex_patterns);
664
665 assert_eq!(wm.search_string("Say Hello World"), Some("Hello World".to_string()));
666 assert_eq!(wm.search_string("说你好 世界"), Some("你好 世界".to_string()));
667 assert_eq!(wm.search_string("has multiple spaces here"), Some("multiple spaces".to_string()));
668 assert_eq!(wm.search_string("tab\there you go"), Some("tab\there".to_string()));
669 assert_eq!(wm.search_string("new\nline break"), Some("new\nline".to_string()));
670 }
671
672 #[test]
673 fn test_ignore_spaces_functionality() {
674 let patterns = vec!["关键词 2500".to_string(), "hello world".to_string()];
675 let wm = WuManber::new_with_space_handling(patterns, SpaceHandling::IgnoreSpaces);
676
677 assert_eq!(wm.search_string("关键词2500"), Some("关键词 2500".to_string()));
679 assert_eq!(wm.search_string("关键词 2500"), Some("关键词 2500".to_string()));
680 assert_eq!(wm.search_string("关键词 2500"), Some("关键词 2500".to_string()));
681 assert_eq!(wm.search_string("helloworld"), Some("hello world".to_string()));
682 assert_eq!(wm.search_string("hello world"), Some("hello world".to_string()));
683 }
684
685 #[test]
686 fn test_parallel_performance() {
687 let patterns: Vec<String> = (0..5000).map(|i| format!("关键词{i}")).collect();
688
689 let wm_seq = WuManber::new(patterns.clone(), 2);
690 let wm_par = WuManber::new_parallel(patterns, 2);
691
692 let text = "这里包含关键词2500和关键词3000";
693
694 let result_seq = wm_seq.search_all_strings(text);
695 let result_par = wm_par.search_all_strings(text);
696
697 assert_eq!(result_seq.len(), result_par.len());
698 assert!(result_seq.contains(&"关键词2500".to_string()));
699 assert!(result_seq.contains(&"关键词3000".to_string()));
700
701 for item in &result_seq {
702 assert!(result_par.contains(item));
703 }
704 }
705
706 #[test]
707 fn test_search_all_functionality() {
708 let wm = WuManber::new_chinese(vec!["苹果".to_string(), "香蕉".to_string(), "橙子".to_string()]);
709
710 let text = "我喜欢吃苹果、香蕉和橙子";
711 let results = wm.search_all_strings(text);
712
713 assert_eq!(results.len(), 3);
714 assert!(results.contains(&"苹果".to_string()));
715 assert!(results.contains(&"香蕉".to_string()));
716 assert!(results.contains(&"橙子".to_string()));
717 }
718}