Skip to main content

textprep/
flash.rs

1//! Fast keyword matching using Aho-Corasick.
2
3use aho_corasick::{AhoCorasick, MatchKind};
4use std::collections::HashMap;
5
6#[derive(Debug, Clone, PartialEq, Eq)]
7#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
8pub struct KeywordMatch {
9    pub keyword: String,
10    pub value: String,
11    pub start: usize,
12    pub end: usize,
13}
14
15/// Fast keyword matching using Aho-Corasick.
16///
17/// ```
18/// use textprep::FlashText;
19///
20/// let mut ft = FlashText::new();
21/// ft.add_keyword("New York", "NYC");
22/// ft.add_keyword("San Francisco", "SF");
23///
24/// let matches = ft.find("I flew from New York to San Francisco.");
25/// assert_eq!(matches.len(), 2);
26/// assert_eq!(matches[0].value, "NYC");
27/// assert_eq!(matches[1].value, "SF");
28/// ```
29pub struct FlashText {
30    keywords: HashMap<String, String>,
31    matcher: Option<AhoCorasick>,
32    pattern_list: Vec<String>,
33    case_insensitive: bool,
34}
35
36impl FlashText {
37    pub fn new() -> Self {
38        Self {
39            keywords: HashMap::new(),
40            matcher: None,
41            pattern_list: Vec::new(),
42            case_insensitive: true,
43        }
44    }
45
46    pub fn add_keyword(&mut self, keyword: impl Into<String>, value: impl Into<String>) {
47        let kw = keyword.into();
48        let val = value.into();
49        self.keywords.insert(kw.clone(), val);
50        self.pattern_list.push(kw);
51        self.matcher = None;
52    }
53
54    fn ensure_built(&mut self) {
55        if self.matcher.is_none() {
56            let ac = AhoCorasick::builder()
57                .match_kind(MatchKind::LeftmostLongest)
58                .ascii_case_insensitive(self.case_insensitive)
59                .build(&self.pattern_list)
60                .expect("failed to build Aho-Corasick matcher");
61            self.matcher = Some(ac);
62        }
63    }
64
65    /// Find all keyword matches, writing results into `out`.
66    ///
67    /// `out` is cleared first. This is useful when you want to reuse allocations in hot loops.
68    pub fn find_into(&mut self, text: &str, out: &mut Vec<KeywordMatch>) {
69        out.clear();
70        self.ensure_built();
71        let matcher = self.matcher.as_ref().unwrap();
72
73        // `aho-corasick` yields byte offsets. Convert to char offsets in a single pass
74        // by incrementally advancing from the last match boundary.
75        let mut last_byte = 0usize;
76        let mut last_char = 0usize;
77
78        for mat in matcher.find_iter(text) {
79            let pattern = &self.pattern_list[mat.pattern()];
80            let value = self
81                .keywords
82                .get(pattern)
83                .cloned()
84                .unwrap_or_else(|| pattern.clone());
85
86            // Advance char counter from last match end → current match start.
87            if mat.start() >= last_byte {
88                last_char += text[last_byte..mat.start()].chars().count();
89            } else {
90                // Defensive: should not happen for `find_iter` (monotonic), but keep correctness.
91                last_char = text[..mat.start()].chars().count();
92            }
93            let start = last_char;
94            let len = text[mat.start()..mat.end()].chars().count();
95
96            out.push(KeywordMatch {
97                keyword: pattern.clone(),
98                value,
99                start,
100                end: start + len,
101            });
102
103            // Update last boundary to end of match.
104            last_byte = mat.end();
105            last_char = start + len;
106        }
107    }
108
109    pub fn find(&mut self, text: &str) -> Vec<KeywordMatch> {
110        let mut matches = Vec::new();
111        self.find_into(text, &mut matches);
112        matches
113    }
114}
115
116impl Default for FlashText {
117    fn default() -> Self {
118        Self::new()
119    }
120}
121
122#[cfg(test)]
123mod tests {
124    use super::*;
125
126    #[test]
127    fn test_find_char_offsets_are_correct_for_unicode() {
128        let mut ft = FlashText::new();
129        ft.add_keyword("東京", "tokyo");
130        ft.add_keyword("Müller", "muller");
131
132        // Mixed ASCII + multibyte.
133        let text = "a 東京 b Müller c";
134        let matches = ft.find(text);
135
136        // Char offsets in `text`:
137        // 0 a
138        // 1 ' '
139        // 2 東
140        // 3 京
141        // 4 ' '
142        // 5 b
143        // 6 ' '
144        // 7 M
145        // 8 ü
146        // 9 l
147        // 10 l
148        // 11 e
149        // 12 r
150        // 13 ' '
151        // 14 c
152        assert_eq!(
153            matches,
154            vec![
155                KeywordMatch {
156                    keyword: "東京".to_string(),
157                    value: "tokyo".to_string(),
158                    start: 2,
159                    end: 4
160                },
161                KeywordMatch {
162                    keyword: "Müller".to_string(),
163                    value: "muller".to_string(),
164                    start: 7,
165                    end: 13
166                }
167            ]
168        );
169    }
170}