matcher_rs 0.1.7

A high performance multiple functional word matcher
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
use std::borrow::Cow;
use std::intrinsics::{likely, unlikely};
use std::iter;
use std::simd::Simd;

use aho_corasick::{AhoCorasick, AhoCorasickBuilder, AhoCorasickKind::DFA, MatchKind};
use gxhash::{HashMap as GxHashMap, HashSet as GxHashSet};
use nohash_hasher::{IntMap, IntSet};
use serde::Serialize;
use tinyvec::ArrayVec;

use super::{MatchResultTrait, StrConvType, TextMatcherTrait};

/// Raw conversion tables embedded into the binary at compile time.
///
/// Each constant holds the complete contents of one text file, inserted by
/// `include_str!` as a `&'static str`. The paths are relative to this source file.
/// The tables are parsed line by line in `SimpleMatcher::_get_process_matcher`,
/// which discards entries whose key starts with `#` (unless the key is exactly `#`)
/// and entries whose key equals its replacement.
///
/// Tab-separated `pattern<TAB>replacement` tables (one pair per line):
///
/// - `FANJIAN`: Chinese traditional/simplified character mapping, used by `Fanjian`.
///   NOTE(review): the name ("fan-jian") suggests traditional → simplified; confirm the
///   direction against the data file.
/// - `UNICODE`: Maps various Unicode characters to their equivalent forms, used by `Fanjian`.
/// - `UPPER_LOWER`: Maps uppercase English characters to their lowercase equivalents,
///   used by `Normalize`.
/// - `EN_VARIATION`: Maps English characters in different variations to their standard
///   forms, used by `Normalize`.
/// - `NUM_NORM`: Maps numeric characters to their normalized forms, used by `Normalize`.
/// - `PINYIN`: Maps Chinese characters to Pinyin representations, used by `PinYin`.
/// - `PINYIN_CHAR`: Maps individual Chinese characters to their Pinyin equivalents,
///   used by `PinYinChar`.
///
/// Deletion lists (one pattern per line, every pattern is replaced by the empty string):
///
/// - `PUNCTUATION_SPECIAL`: Punctuation marks, used by `WordDelete` and `TextDelete`.
/// - `CN_SPECIAL`: Chinese special characters, used by `TextDelete`.
/// - `EN_SPECIAL`: English special characters, used by `TextDelete`.
const FANJIAN: &str = include_str!("../str_conv_map/FANJIAN.txt");
const CN_SPECIAL: &str = include_str!("../str_conv_map/CN-SPECIAL.txt");
const EN_SPECIAL: &str = include_str!("../str_conv_map/EN-SPECIAL.txt");
const PUNCTUATION_SPECIAL: &str = include_str!("../str_conv_map/PUNCTUATION-SPECIAL.txt");
const EN_VARIATION: &str = include_str!("../str_conv_map/EN-VARIATION.txt");
const UNICODE: &str = include_str!("../str_conv_map/UNICODE.txt");
const NUM_NORM: &str = include_str!("../str_conv_map/NUM-NORM.txt");
const UPPER_LOWER: &str = include_str!("../str_conv_map/UPPER-LOWER.txt");
const PINYIN: &str = include_str!("../str_conv_map/PINYIN.txt");
const PINYIN_CHAR: &str = include_str!("../str_conv_map/PINYIN-CHAR.txt");

/// Unicode whitespace characters, each stored as a one-character `&str`.
///
/// `SimpleMatcher::_get_process_matcher` adds every entry as a pattern that is
/// replaced by the empty string for the `WordDelete` and `TextDelete` match types,
/// i.e. these characters are deleted from the text being processed.
///
/// These characters include:
///
/// - Horizontal tab (`\u{0009}`).
/// - Line feed (`\u{000A}`).
/// - Vertical tab (`\u{000B}`).
/// - Form feed (`\u{000C}`).
/// - Carriage return (`\u{000D}`).
/// - Space (`\u{0020}`).
/// - Next line (`\u{0085}`).
/// - No-break space (`\u{00A0}`).
/// - Ogham space mark (`\u{1680}`).
/// - En quad (`\u{2000}`).
/// - Em quad (`\u{2001}`).
/// - En space (`\u{2002}`).
/// - Em space (`\u{2003}`).
/// - Three-per-em space (`\u{2004}`).
/// - Four-per-em space (`\u{2005}`).
/// - Six-per-em space (`\u{2006}`).
/// - Figure space (`\u{2007}`).
/// - Punctuation space (`\u{2008}`).
/// - Thin space (`\u{2009}`).
/// - Hair space (`\u{200A}`).
/// - Line separator (`\u{2028}`).
/// - Paragraph separator (`\u{2029}`).
/// - Narrow no-break space (`\u{202F}`).
/// - Medium mathematical space (`\u{205F}`).
/// - Ideographic space (`\u{3000}`).
const WHITE_SPACE: &[&str] = &[
    "\u{0009}", "\u{000A}", "\u{000B}", "\u{000C}", "\u{000D}", "\u{0020}", "\u{0085}", "\u{00A0}",
    "\u{1680}", "\u{2000}", "\u{2001}", "\u{2002}", "\u{2003}", "\u{2004}", "\u{2005}", "\u{2006}",
    "\u{2007}", "\u{2008}", "\u{2009}", "\u{200A}", "\u{2028}", "\u{2029}", "\u{202F}", "\u{205F}",
    "\u{3000}",
];

/// The maximum number of distinct comma-separated sub-words tracked for a single word.
///
/// `build_simple_ac_table` keeps at most this many distinct sub-words per word
/// (further ones are dropped), and the same value is the lane count of the
/// `split_bit` SIMD vector stored in `WordConf`.
const WORD_COMBINATION_LIMIT: usize = 32;
/// An all-zero SIMD vector with `WORD_COMBINATION_LIMIT` lanes.
///
/// NOTE(review): not referenced in the part of the file reviewed here; presumably the
/// value that per-word split bits are compared against during matching — confirm at the
/// use site.
const ZEROS: Simd<u8, WORD_COMBINATION_LIMIT> = Simd::from_array([0; WORD_COMBINATION_LIMIT]);

/// Alias for `StrConvType`, the set of string-conversion modes understood by the
/// text matcher, under the name used throughout the `SimpleMatcher` implementation.
///
/// A value selects the preprocessing steps applied before matching (for example
/// `Fanjian`, `WordDelete`, `TextDelete`, `Normalize`, `PinYin`, `PinYinChar`).
/// `SimpleMatcher` treats it as a flag set: it iterates over the contained
/// conversions with `iter()` and removes individual ones with the `-` operator.
pub type SimpleMatchType = StrConvType;

/// Type alias for a mapping from `SimpleMatchType` to an `IntMap` of word IDs
/// and the words they identify.
///
/// The outer key is the `SimpleMatchType` describing which string conversions apply
/// to the words in the inner map. The inner `IntMap` maps a unique `u64` word ID to a
/// borrowed string slice holding the word. A word may contain `,` separators;
/// `SimpleMatcher` splits it into sub-words on `,` when building its tables.
///
/// The lifetime parameter `'a` is the lifetime of the borrowed word slices.
///
/// # Example
///
/// ```rust
/// use matcher_rs::{SimpleMatchType, SimpleMatchTypeWordMap};
/// use gxhash::HashMap as GxHashMap;
/// use nohash_hasher::IntMap;
///
/// let mut simple_match_type_word_map: SimpleMatchTypeWordMap<'_> = GxHashMap::default();
/// let mut simple_word_map = IntMap::default();
///
/// simple_word_map.insert(1, "你好");
/// simple_word_map.insert(2, "123");
///
/// simple_match_type_word_map.insert(SimpleMatchType::Fanjian, simple_word_map);
/// ```
pub type SimpleMatchTypeWordMap<'a> = GxHashMap<SimpleMatchType, IntMap<u64, &'a str>>;

/// Configuration for a word used in the `SimpleMatcher`.
///
/// One `WordConf` is stored per word ID in `SimpleMatcher::simple_wordconf_map`. It keeps
/// an owned copy of the word together with a SIMD vector describing the word's
/// comma-separated sub-words.
///
/// Fields:
///
/// * `word` - The original word as passed to the matcher (commas included), owned as a `String`.
/// * `split_bit` - A `Simd` vector of `u8` with `WORD_COMBINATION_LIMIT` lanes. As built by
///   `build_simple_ac_table`, each of the first lanes corresponds to one distinct sub-word
///   and holds `1 << (min(count, 8) - 1)`, where `count` is how many times that sub-word
///   occurs in the word; the remaining lanes are `0`.
struct WordConf {
    // The full word text, copied from the caller's borrowed slice.
    word: String,
    // Per-sub-word repetition bits; lane order follows the sub-word offsets recorded
    // in `SimpleAcTable::ac_word_conf_list`.
    split_bit: Simd<u8, WORD_COMBINATION_LIMIT>,
}

/// Pairs an Aho-Corasick automaton (`ac_matcher`) with the word configuration entry
/// of each of its patterns (`ac_word_conf_list`).
///
/// One `SimpleAcTable` is built per `SimpleMatchType` by `build_simple_ac_table` and
/// stored in `SimpleMatcher::simple_match_type_ac_table_map`.
///
/// # Fields
///
/// * `ac_matcher` - The `AhoCorasick` automaton built from the processed sub-words of
///   every word registered for the match type. It is built with ASCII case-insensitive
///   matching enabled.
///
/// * `ac_word_conf_list` - One `(word_id, offset)` tuple per automaton pattern, pushed in
///   the same order as the patterns, so the entry at index `i` belongs to pattern `i`.
///   `word_id` identifies the word the pattern came from; `offset` is the position of the
///   originating sub-word among that word's distinct sub-words (the value produced by
///   `enumerate()` while the table is built).
struct SimpleAcTable {
    ac_matcher: AhoCorasick,
    ac_word_conf_list: Vec<(u64, usize)>,
}

#[derive(Debug, Serialize)]
/// A single matched word reported by the `SimpleMatcher`.
///
/// `SimpleResult` carries the unique identifier of a matched word together with the
/// word itself. It derives `Debug` and serde's `Serialize`.
///
/// # Fields
///
/// * `word_id` - A `u64` representing a unique identifier for the matched word.
/// * `word` - A `Cow` (Clone on Write) holding the matched word, so the result can
///   either borrow the text for the lifetime `'a` or own it.
///
/// # Example
///
/// ```
/// use std::borrow::Cow;
/// use matcher_rs::SimpleResult;
///
/// let result = SimpleResult {
///     word_id: 42,
///     word: Cow::Borrowed("example"),
/// };
///
/// println!("Matched word: ID = {}, word = {}", result.word_id, result.word);
/// ```
pub struct SimpleResult<'a> {
    pub word_id: u64,
    pub word: Cow<'a, str>,
}

impl MatchResultTrait<'_> for SimpleResult<'_> {
    /// Identifier of the word this result refers to.
    ///
    /// The value is the `word_id` field, i.e. the ID under which the matched word
    /// was registered with the `SimpleMatcher`.
    ///
    /// # Returns
    ///
    /// * `u64` - The unique identifier of the matched word.
    ///
    /// # Example
    ///
    /// ```
    /// use matcher_rs::{SimpleResult, MatchResultTrait};
    /// use std::borrow::Cow;
    ///
    /// let result = SimpleResult {
    ///     word_id: 42,
    ///     word: Cow::Borrowed("example"),
    /// };
    ///
    /// assert_eq!(result.word_id(), 42);
    /// ```
    fn word_id(&self) -> u64 {
        let id = self.word_id;
        id
    }

    /// The matched word as a string slice.
    ///
    /// Borrows from the `word` field regardless of whether the `Cow` is borrowed or owned.
    fn word(&self) -> &str {
        &*self.word
    }
}

/// `SimpleMatcher` is the main structure for performing text matching operations
/// using various pre-defined string transformation rules.
///
/// It holds the preprocessing automata, the per-match-type word automata and the
/// per-word configuration needed to transform input text and match it against the
/// registered words with the Aho-Corasick algorithm.
///
/// # Fields
///
/// * `simple_match_type_process_map` - Maps a single conversion (one element yielded by
/// `SimpleMatchType::iter()`) to a tuple of a replacement list and an `AhoCorasick`
/// automaton. The replacement at index `i` belongs to pattern `i` of the automaton.
/// Entries are created by `_get_process_matcher`.
///
/// * `simple_match_type_ac_table_map` - Maps a `SimpleMatchType` to its `SimpleAcTable`.
/// `new` stores each table under the word map's match type with `WordDelete` removed.
///
/// * `simple_wordconf_map` - An `IntMap` that maps unique word IDs (`u64`) to their
/// `WordConf`, i.e. the owned word text and its sub-word SIMD vector.
///
/// * `min_chars_count` - Starts at `usize::MAX` and is lowered while tables are built to
/// the smallest number of *distinct* non-comma characters found in any registered word.
/// NOTE(review): presumably used to skip texts that are too short to match; the use
/// site is outside the part of the file reviewed here.
pub struct SimpleMatcher {
    simple_match_type_process_map: GxHashMap<SimpleMatchType, (Vec<&'static str>, AhoCorasick)>,
    simple_match_type_ac_table_map: GxHashMap<SimpleMatchType, SimpleAcTable>,
    simple_wordconf_map: IntMap<u64, WordConf>,
    min_chars_count: usize,
}

impl SimpleMatcher {
    /// Creates a new instance of `SimpleMatcher` using the provided `SimpleMatchTypeWordMap`.
    ///
    /// This constructor initializes the `SimpleMatcher` by setting up process maps and
    /// Aho-Corasick (AC) tables for efficient text matching. The mappings and configurations
    /// for each `SimpleMatchType` are extracted from the provided word map and stored
    /// within the matcher for later use.
    ///
    /// # Arguments
    ///
    /// * `simple_match_type_word_map` - A reference to a `SimpleMatchTypeWordMap` which
    /// contains the mappings between various `SimpleMatchType`s and their corresponding
    /// word maps (`IntMap<u64, &str>`). This serves as the input for initializing the
    /// matcher with the required configurations and patterns.
    ///
    /// # Returns
    ///
    /// A new instance of `SimpleMatcher` with all the necessary mappings and configurations
    /// set up for text matching.
    ///
    /// # Example
    ///
    /// ```rust
    /// use matcher_rs::{SimpleMatchType, SimpleMatchTypeWordMap, SimpleMatcher};
    /// use gxhash::HashMap as GxHashMap;
    /// use nohash_hasher::IntMap;
    ///
    /// let mut simple_match_type_word_map: SimpleMatchTypeWordMap<'_> = GxHashMap::default();
    /// let mut simple_word_map = IntMap::default();
    ///
    /// simple_word_map.insert(1, "你好");
    /// simple_word_map.insert(2, "123");
    ///
    /// simple_match_type_word_map.insert(SimpleMatchType::Fanjian, simple_word_map);
    ///
    /// let matcher = SimpleMatcher::new(simple_match_type_word_map);
    /// ```
    pub fn new<'a, I, M>(simple_match_type_word_map: I) -> SimpleMatcher
    where
        I: IntoIterator<Item = (SimpleMatchType, M)>,
        M: IntoIterator<Item = (u64, &'a str)>,
    {
        // Create a new instance of SimpleMatcher with default values
        let mut simple_matcher = SimpleMatcher {
            simple_match_type_process_map: GxHashMap::default(),
            simple_match_type_ac_table_map: GxHashMap::default(),
            simple_wordconf_map: IntMap::default(),
            min_chars_count: usize::MAX,
        };

        // Iterate over each entry in the provided SimpleMatchTypeWordMap
        for (simple_match_type, simple_word_map) in simple_match_type_word_map {
            // Iterate over each bit set in the SimpleMatchType
            for simple_match_type_bit in simple_match_type.iter() {
                // Insert a new process matcher into the process map if it doesn't exist already
                simple_matcher
                    .simple_match_type_process_map
                    .entry(simple_match_type_bit)
                    .or_insert_with(|| Self::_get_process_matcher(&simple_match_type_bit));
            }

            // Build the Aho-Corasick table for the current SimpleMatchType excluding TextDelete
            let simple_ac_table = simple_matcher.build_simple_ac_table(
                &(simple_match_type - SimpleMatchType::TextDelete),
                simple_word_map,
            );

            // Insert the built AC table into the AC table map,
            // using SimpleMatchType excluding WordDelete as the key
            simple_matcher.simple_match_type_ac_table_map.insert(
                simple_match_type - SimpleMatchType::WordDelete,
                simple_ac_table,
            );
        }

        // Return the configured SimpleMatcher instance
        simple_matcher
    }

    /// Generates the process matcher for a given `SimpleMatchType`.
    ///
    /// This function constructs a mapping between input patterns and their corresponding
    /// replacement strings based on the provided `SimpleMatchType`. The patterns and their
    /// replacements are used to build an Aho-Corasick (AC) automaton, which efficiently
    /// matches and replaces text during the preprocessing phase.
    ///
    /// The function considers various string transformation rules, which are classified
    /// under different `SimpleMatchType` values such as `Fanjian`, `WordDelete`, `TextDelete`,
    /// `Normalize`, `PinYin`, and `PinYinChar`. Depending on the type, it loads the
    /// corresponding conversion data, creates a dictionary of patterns and their replacements,
    /// and then builds the AC automaton.
    ///
    /// # Arguments
    ///
    /// * `simple_match_type_bit` - A reference to a `SimpleMatchType` indicating the type
    /// of string transformation to be applied.
    ///
    /// # Returns
    ///
    /// A tuple containing:
    /// * `process_replace_list` - A vector of replacement strings (&'static str) used for
    ///   transforming the input patterns.
    /// * `process_matcher` - An `AhoCorasick` automaton built using the input patterns,
    ///   which facilitates efficient matching and replacement operations.
    ///
    /// # Example
    ///
    /// ```rust
    /// use matcher_rs::{SimpleMatchType, SimpleMatcher};
    ///
    /// let simple_match_type = SimpleMatchType::Fanjian;
    /// let (process_replace_list, process_matcher) = SimpleMatcher::_get_process_matcher(&simple_match_type);
    ///
    /// // Use the returned process_replace_list and process_matcher for further text processing
    /// ```
    pub fn _get_process_matcher(
        simple_match_type_bit: &SimpleMatchType,
    ) -> (Vec<&'static str>, AhoCorasick) {
        // Create a mutable dictionary (hash map) to store process pairs.
        let mut process_dict = GxHashMap::default();

        // Match against the specific string conversion type.
        match *simple_match_type_bit {
            // If no conversion type specified, do nothing.
            SimpleMatchType::None => {}

            // For Fanjian conversion: process FANJIAN and UNICODE data files.
            SimpleMatchType::Fanjian => {
                for str_conv_dat in [FANJIAN, UNICODE] {
                    // Extend the process dictionary with mappings from the conversion data.
                    process_dict.extend(str_conv_dat.trim().split('\n').map(|pair_str| {
                        let mut pair_str_split = pair_str.split('\t');
                        (
                            // Each line in the conversion data corresponds to a key-value pair.
                            pair_str_split.next().unwrap(),
                            pair_str_split.next().unwrap(),
                        )
                    }));
                }
            }

            // For WordDelete conversion: process punctuation and whitespace characters.
            SimpleMatchType::WordDelete => {
                process_dict.extend(
                    PUNCTUATION_SPECIAL
                        .trim()
                        .split('\n')
                        // Map each punctuation character to an empty string (deletion).
                        .map(|pair_str| (pair_str, "")),
                );

                // Map each whitespace character to an empty string (deletion).
                process_dict.extend(WHITE_SPACE.iter().map(|&c| (c, "")));
            }

            // For TextDelete conversion: process punctuation, Chinese special, and English special characters.
            SimpleMatchType::TextDelete => {
                for str_conv_dat in [PUNCTUATION_SPECIAL, CN_SPECIAL, EN_SPECIAL] {
                    process_dict.extend(
                        str_conv_dat
                            .trim()
                            .split('\n')
                            // Map each special character to an empty string (deletion).
                            .map(|pair_str| (pair_str, "")),
                    );
                }

                // Map each whitespace character to an empty string (deletion).
                process_dict.extend(WHITE_SPACE.iter().map(|&c| (c, "")));
            }
            // For Normalize conversion: process UPPER_LOWER, EN_VARIATION, and NUM_NORM data files.
            SimpleMatchType::Normalize => {
                for str_conv_dat in [UPPER_LOWER, EN_VARIATION, NUM_NORM] {
                    // Extend the process dictionary with mappings from the conversion data.
                    process_dict.extend(str_conv_dat.trim().split('\n').map(|pair_str| {
                        let mut pair_str_split = pair_str.split('\t');
                        (
                            // Each line in the conversion data corresponds to a key-value pair.
                            pair_str_split.next().unwrap(),
                            pair_str_split.next().unwrap(),
                        )
                    }));
                }
            }

            // For PinYin conversion: process PINYIN data file.
            SimpleMatchType::PinYin => {
                process_dict.extend(PINYIN.trim().split('\n').map(|pair_str| {
                    let mut pair_str_split = pair_str.split('\t');
                    (
                        // Each line in the conversion data corresponds to a key-value pair.
                        pair_str_split.next().unwrap(),
                        pair_str_split.next().unwrap(),
                    )
                }));
            }

            // For PinYinChar conversion: process PINYIN_CHAR data file.
            SimpleMatchType::PinYinChar => {
                process_dict.extend(PINYIN_CHAR.trim().split('\n').map(|pair_str| {
                    let mut pair_str_split = pair_str.split('\t');
                    (
                        // Each line in the conversion data corresponds to a key-value pair.
                        pair_str_split.next().unwrap(),
                        pair_str_split.next().unwrap(),
                    )
                }));
            }

            // Ignore unknown or unsupported conversion types.
            _ => {}
        }

        // Remove entries where the key starts with '#' (except key "#")
        // or where the key and value are identical.
        process_dict
            .retain(|&key, &mut value| (key == "#" || !key.starts_with('#')) && key != value);

        // Build an Aho-Corasick automaton (process_matcher) for efficient matching.
        let process_matcher = AhoCorasickBuilder::new()
            .kind(Some(DFA))
            .match_kind(MatchKind::LeftmostLongest)
            .build(
                // Collect all keys (patterns) to be matched.
                process_dict
                    .iter()
                    .map(|(&key, _)| key)
                    .collect::<Vec<&str>>(),
            )
            .unwrap();

        // Collect the corresponding replacement values.
        let process_replace_list = process_dict.iter().map(|(_, &val)| val).collect();

        // Return the tuple containing the replacement list and the process matcher.
        (process_replace_list, process_matcher)
    }

    /// Constructs a `SimpleAcTable` for a given `SimpleMatchType` and set of words.
    ///
    /// Every word is split on `,` into sub-words (empty pieces are ignored). For each word
    /// this method
    ///
    /// * lowers `self.min_chars_count` to the number of distinct non-comma characters of
    ///   the word if that is smaller than the current value,
    /// * stores a `WordConf` (owned word text plus `split_bit` vector) in
    ///   `self.simple_wordconf_map`, replacing any earlier entry with the same word ID, and
    /// * adds every processed form of each distinct sub-word, as returned by
    ///   `reduce_text_process`, as a pattern of the automaton.
    ///
    /// At most `WORD_COMBINATION_LIMIT` distinct sub-words per word are kept. Which ones are
    /// kept when a word has more is determined by the hash map's iteration order.
    ///
    /// # Arguments
    ///
    /// * `simple_match_type` - A reference to the `SimpleMatchType` whose conversions are
    /// applied to each sub-word before it is added to the automaton. A process matcher for
    /// every conversion it contains must already be present in
    /// `self.simple_match_type_process_map`.
    /// * `simple_word_map` - An iterable of `(word_id, word)` pairs, consumed by value.
    ///
    /// # Returns
    ///
    /// A `SimpleAcTable` whose `ac_matcher` is an ASCII-case-insensitive Aho-Corasick DFA over
    /// the processed sub-words and whose `ac_word_conf_list` holds, for pattern `i`, the
    /// `(word_id, offset)` of the sub-word it was derived from.
    ///
    /// # Panics
    ///
    /// Panics if the Aho-Corasick automaton cannot be built.
    fn build_simple_ac_table<'a, M>(
        &mut self,
        simple_match_type: &SimpleMatchType,
        simple_word_map: M,
    ) -> SimpleAcTable
    where
        M: IntoIterator<Item = (u64, &'a str)>,
    {
        // Parallel vectors: pattern `i` of the automaton belongs to `ac_word_conf_list[i]`.
        let mut ac_wordlist = Vec::new();
        let mut ac_word_conf_list = Vec::new();

        for (simple_word_id, simple_word) in simple_word_map {
            // Track the smallest number of distinct characters (commas excluded) of any word.
            self.min_chars_count = self.min_chars_count.min(
                simple_word
                    .chars()
                    .filter(|&c| c != ',')
                    .collect::<GxHashSet<char>>()
                    .len(),
            );

            // Count how often each distinct, non-empty sub-word occurs in the word.
            let mut ac_split_word_counter = GxHashMap::default();
            for ac_split_word in simple_word.split(',').filter(|&x| !x.is_empty()) {
                *ac_split_word_counter.entry(ac_split_word).or_insert(0u32) += 1;
            }

            // One lane per distinct sub-word: a count of n (capped at 8) sets bit n - 1, so the
            // shift is always in 0..=7 and fits in a `u8`. The buffer capacity is tied to
            // WORD_COMBINATION_LIMIT so it can never be smaller than what `take` lets through.
            let split_bit_vec = ac_split_word_counter
                .values()
                .take(WORD_COMBINATION_LIMIT)
                .map(|&count| 1u8 << (count.min(8) - 1))
                .collect::<ArrayVec<[u8; WORD_COMBINATION_LIMIT]>>();
            // Lanes beyond the number of sub-words are filled with the default value 0.
            let split_bit = Simd::load_or_default(&split_bit_vec);

            self.simple_wordconf_map.insert(
                simple_word_id,
                WordConf {
                    word: simple_word.to_owned(),
                    split_bit,
                },
            );

            // `keys()` walks the same, unmodified map as `values()` above, so `offset`
            // identifies the lane of `split_bit` that belongs to this sub-word.
            for (offset, split_word) in ac_split_word_counter
                .keys()
                .take(WORD_COMBINATION_LIMIT)
                .enumerate()
            {
                // Every processed form of the sub-word becomes its own pattern,
                // all pointing back to the same (word, lane).
                for ac_word in self.reduce_text_process(simple_match_type, split_word.as_bytes()) {
                    ac_wordlist.push(ac_word);
                    ac_word_conf_list.push((simple_word_id, offset));
                }
            }
        }

        SimpleAcTable {
            ac_matcher: AhoCorasickBuilder::new()
                .kind(Some(DFA))
                .ascii_case_insensitive(true)
                .build(&ac_wordlist)
                .unwrap(),
            ac_word_conf_list,
        }
    }

    /// Builds the chain of progressively transformed versions of `text_bytes` selected by
    /// `simple_match_type`.
    ///
    /// The returned list always starts with the untouched input (borrowed, not copied). The
    /// bits of `simple_match_type` are then visited in the order produced by its `iter()`;
    /// for each bit the `(replacement list, matcher)` pair stored in
    /// `simple_match_type_process_map` is applied to the *most recent* entry of the list.
    ///
    /// # Arguments
    ///
    /// * `simple_match_type` - The set of transformation bits to apply.
    /// * `text_bytes` - The raw bytes of the text to transform.
    ///
    /// # Returns
    ///
    /// * An `ArrayVec` (inline capacity 4) of text versions. Index 0 is the original input (or
    ///   its `Fanjian`-converted form); every later entry is the result of applying one more
    ///   transformation to the entry before it.
    ///
    /// # Processing Logic
    ///
    /// * `None`: nothing happens.
    /// * `Fanjian`: if the matcher hits, the latest entry is **overwritten in place** with the
    ///   replaced text; no new entry is appended.
    /// * `TextDelete` / `WordDelete`: if the matcher hits, a new entry is appended that holds
    ///   the latest text with every matched span removed.
    /// * Any other bit: if the matcher hits, a new entry is appended that holds the latest text
    ///   with every matched pattern swapped for its replacement.
    ///
    /// A step whose matcher finds nothing leaves the list unchanged.
    ///
    /// # Safety notes
    ///
    /// The map lookup uses `unwrap_unchecked`, so every bit yielded by `simple_match_type`
    /// must have an entry in `simple_match_type_process_map` (presumably guaranteed when the
    /// matcher is built; confirm against the constructor). `last_mut().unwrap_unchecked()` is
    /// sound because the list is seeded with one entry and never shrinks.
    #[inline]
    fn reduce_text_process<'a>(
        &self,
        simple_match_type: &SimpleMatchType,
        text_bytes: &'a [u8],
    ) -> ArrayVec<[Cow<'a, [u8]>; 4]> {
        // Seed the list with the original text; it is borrowed so no copy is made.
        let mut processed_text_bytes_list: ArrayVec<[Cow<'a, [u8]>; 4]> = ArrayVec::new();
        processed_text_bytes_list.push(Cow::Borrowed(text_bytes));

        for simple_match_type_bit in simple_match_type.iter() {
            // Replacement table and matcher registered for this transformation bit.
            let (process_replace_list, process_matcher) = unsafe {
                self.simple_match_type_process_map
                    .get(&simple_match_type_bit)
                    .unwrap_unchecked()
            };
            // Every step works on the newest version of the text.
            let tmp_processed_text_bytes =
                unsafe { processed_text_bytes_list.last_mut().unwrap_unchecked() };

            match simple_match_type_bit {
                SimpleMatchType::None => {}
                SimpleMatchType::Fanjian => {
                    if unlikely(process_matcher.is_match(tmp_processed_text_bytes.as_ref())) {
                        // Replace on the same buffer that was just tested. Reading from the
                        // original `text_bytes` here would silently discard any earlier
                        // transformation held in the latest entry.
                        let replaced_text_bytes = process_matcher.replace_all_bytes(
                            tmp_processed_text_bytes.as_ref(),
                            process_replace_list,
                        );
                        // Overwrite in place: this step does not add a new version.
                        *tmp_processed_text_bytes = Cow::Owned(replaced_text_bytes);
                    }
                }
                SimpleMatchType::TextDelete | SimpleMatchType::WordDelete => {
                    if likely(process_matcher.is_match(tmp_processed_text_bytes.as_ref())) {
                        // The result can only be shorter than the input, so the input length
                        // is a sufficient capacity.
                        let mut processed_text_bytes =
                            Vec::with_capacity(tmp_processed_text_bytes.len());
                        let mut last_match = 0;

                        // Copy the gaps between consecutive matches, skipping the matches.
                        for mat in process_matcher.find_iter(tmp_processed_text_bytes.as_ref()) {
                            // Match offsets come from the matcher run on this same buffer,
                            // which is what the unchecked slicing relies on.
                            processed_text_bytes.extend(unsafe {
                                tmp_processed_text_bytes.get_unchecked(last_match..mat.start())
                            });
                            last_match = mat.end();
                        }
                        // Tail after the final match.
                        processed_text_bytes.extend(unsafe {
                            tmp_processed_text_bytes.get_unchecked(last_match..)
                        });

                        processed_text_bytes_list.push(Cow::Owned(processed_text_bytes));
                    }
                }
                _ => {
                    if process_matcher.is_match(tmp_processed_text_bytes.as_ref()) {
                        let processed_text_bytes = process_matcher
                            .replace_all_bytes(tmp_processed_text_bytes, process_replace_list);
                        processed_text_bytes_list.push(Cow::Owned(processed_text_bytes));
                    }
                }
            }
        }

        processed_text_bytes_list
    }
}

impl<'a> TextMatcherTrait<'a, SimpleResult<'a>> for SimpleMatcher {
    /// Reports whether `text` satisfies at least one configured word.
    ///
    /// # Arguments
    ///
    /// * `text` - The text to test.
    ///
    /// # Returns
    ///
    /// * `bool` - `true` as soon as one word is found to be complete, `false` otherwise.
    ///
    /// # Processing Logic
    ///
    /// 1. Texts with fewer characters than `min_chars_count` are rejected immediately.
    /// 2. For every `(SimpleMatchType, SimpleAcTable)` pair in `simple_match_type_ac_table_map`,
    ///    the text is expanded into its transformed variants via `reduce_text_process`.
    /// 3. Each variant is scanned with the table's Aho-Corasick matcher in overlapping mode.
    /// 4. Every hit is mapped to a word ID and a lane index through `ac_word_conf_list`. The
    ///    word gets (on first sight) one copy of its `split_bit` vector per variant, and the
    ///    hit shifts the addressed lane of the current variant's vector right by one bit.
    /// 5. The word counts as matched once the bitwise AND of all its per-variant vectors
    ///    equals `ZEROS`; the method then returns `true` without scanning further.
    fn is_match(&self, text: &str) -> bool {
        let text_bytes = text.as_bytes();

        // Too short to contain any configured word.
        if unlikely(bytecount::num_chars(text_bytes) < self.min_chars_count) {
            return false;
        }

        // word ID -> one split-bit vector per text variant.
        let mut word_id_split_bit_map = IntMap::default();

        for (simple_match_type, simple_ac_table) in &self.simple_match_type_ac_table_map {
            let text_variants = self.reduce_text_process(simple_match_type, text_bytes);
            let variant_count = text_variants.len();

            for (variant_index, variant) in text_variants.iter().enumerate() {
                for hit in simple_ac_table.ac_matcher.find_overlapping_iter(variant) {
                    // Pattern IDs index straight into `ac_word_conf_list`.
                    let pattern_conf = unsafe {
                        simple_ac_table
                            .ac_word_conf_list
                            .get_unchecked(hit.pattern().as_usize())
                    };
                    let word_id = pattern_conf.0;
                    let word_conf =
                        unsafe { self.simple_wordconf_map.get(&word_id).unwrap_unchecked() };

                    // First hit for this word: start every variant from the word's template.
                    let split_bits = word_id_split_bit_map.entry(word_id).or_insert_with(|| {
                        iter::repeat_n(word_conf.split_bit, variant_count)
                            .collect::<ArrayVec<[_; 4]>>()
                    });

                    // Record the hit in the lane belonging to this pattern, for this variant.
                    let lane = unsafe {
                        split_bits
                            .get_unchecked_mut(variant_index)
                            .as_mut_array()
                            .get_unchecked_mut(pattern_conf.1)
                    };
                    *lane >>= 1;

                    // AND across variants, then compare against the all-zero vector.
                    let word_complete = split_bits
                        .iter()
                        .fold(Simd::splat(1), |acc, &bit| acc & bit)
                        == ZEROS;
                    if unlikely(word_complete) {
                        return true;
                    }
                }
            }
        }

        false
    }

    /// Collects every configured word that `text` satisfies.
    ///
    /// # Arguments
    ///
    /// * `text` - The text to scan.
    ///
    /// # Returns
    ///
    /// * `Vec<SimpleResult<'a>>` - One entry per matched word, carrying the word ID and a
    ///   borrowed view of the word stored in `simple_wordconf_map`. Each word ID appears at
    ///   most once; the vector is empty when the text is shorter than `min_chars_count` or
    ///   nothing matches.
    ///
    /// # Processing Logic
    ///
    /// The scan mirrors `is_match`: for every `(SimpleMatchType, SimpleAcTable)` pair the text
    /// is expanded with `reduce_text_process`, each variant is searched for overlapping
    /// patterns, and each hit shifts one lane of the word's split-bit vector for that variant.
    /// When the bitwise AND of a word's per-variant vectors equals `ZEROS`, the word is pushed
    /// to the result and its ID is remembered so that later hits for it are skipped.
    fn process(&'a self, text: &str) -> Vec<SimpleResult<'a>> {
        let text_bytes = text.as_bytes();

        // Too short to contain any configured word.
        if unlikely(bytecount::num_chars(text_bytes) < self.min_chars_count) {
            return Vec::new();
        }

        let mut result_list = Vec::new();
        // IDs already reported, so each word is emitted only once.
        let mut word_id_set = IntSet::default();
        // word ID -> one split-bit vector per text variant.
        let mut word_id_split_bit_map = IntMap::default();

        for (simple_match_type, simple_ac_table) in &self.simple_match_type_ac_table_map {
            let text_variants = self.reduce_text_process(simple_match_type, text_bytes);
            let variant_count = text_variants.len();

            for (variant_index, variant) in text_variants.iter().enumerate() {
                for hit in simple_ac_table.ac_matcher.find_overlapping_iter(variant) {
                    // Pattern IDs index straight into `ac_word_conf_list`.
                    let pattern_index = hit.pattern().as_usize();
                    let pattern_conf =
                        unsafe { simple_ac_table.ac_word_conf_list.get_unchecked(pattern_index) };
                    let word_id = pattern_conf.0;

                    // Already reported: nothing more to learn from this hit.
                    if word_id_set.contains(&word_id) {
                        continue;
                    }

                    let word_conf =
                        unsafe { self.simple_wordconf_map.get(&word_id).unwrap_unchecked() };

                    // First hit for this word: start every variant from the word's template.
                    let split_bits = word_id_split_bit_map.entry(word_id).or_insert_with(|| {
                        iter::repeat_n(word_conf.split_bit, variant_count)
                            .collect::<ArrayVec<[_; 4]>>()
                    });

                    // Record the hit in the lane belonging to this pattern, for this variant.
                    let lane = unsafe {
                        split_bits
                            .get_unchecked_mut(variant_index)
                            .as_mut_array()
                            .get_unchecked_mut(pattern_conf.1)
                    };
                    *lane >>= 1;

                    // AND across variants, then compare against the all-zero vector.
                    let word_complete = split_bits
                        .iter()
                        .fold(Simd::splat(1), |acc, &bit| acc & bit)
                        == ZEROS;
                    if unlikely(word_complete) {
                        word_id_set.insert(word_id);
                        result_list.push(SimpleResult {
                            word_id,
                            word: Cow::Borrowed(&word_conf.word),
                        });
                    }
                }
            }
        }

        result_list
    }
}