1use aho_corasick::{AhoCorasick, MatchKind};
4use std::collections::HashMap;
5
6#[derive(Debug, Clone, PartialEq, Eq)]
7#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
8pub struct KeywordMatch {
9 pub keyword: String,
10 pub value: String,
11 pub start: usize,
12 pub end: usize,
13}
14
15pub struct FlashText {
30 keywords: HashMap<String, String>,
31 matcher: Option<AhoCorasick>,
32 pattern_list: Vec<String>,
33 case_insensitive: bool,
34}
35
36impl FlashText {
37 pub fn new() -> Self {
38 Self {
39 keywords: HashMap::new(),
40 matcher: None,
41 pattern_list: Vec::new(),
42 case_insensitive: true,
43 }
44 }
45
46 pub fn add_keyword(&mut self, keyword: impl Into<String>, value: impl Into<String>) {
47 let kw = keyword.into();
48 let val = value.into();
49 self.keywords.insert(kw.clone(), val);
50 self.pattern_list.push(kw);
51 self.matcher = None;
52 }
53
54 fn ensure_built(&mut self) {
55 if self.matcher.is_none() {
56 let ac = AhoCorasick::builder()
57 .match_kind(MatchKind::LeftmostLongest)
58 .ascii_case_insensitive(self.case_insensitive)
59 .build(&self.pattern_list)
60 .expect("failed to build Aho-Corasick matcher");
61 self.matcher = Some(ac);
62 }
63 }
64
65 pub fn find_into(&mut self, text: &str, out: &mut Vec<KeywordMatch>) {
69 out.clear();
70 self.ensure_built();
71 let matcher = self.matcher.as_ref().unwrap();
72
73 let mut last_byte = 0usize;
76 let mut last_char = 0usize;
77
78 for mat in matcher.find_iter(text) {
79 let pattern = &self.pattern_list[mat.pattern()];
80 let value = self
81 .keywords
82 .get(pattern)
83 .cloned()
84 .unwrap_or_else(|| pattern.clone());
85
86 if mat.start() >= last_byte {
88 last_char += text[last_byte..mat.start()].chars().count();
89 } else {
90 last_char = text[..mat.start()].chars().count();
92 }
93 let start = last_char;
94 let len = text[mat.start()..mat.end()].chars().count();
95
96 out.push(KeywordMatch {
97 keyword: pattern.clone(),
98 value,
99 start,
100 end: start + len,
101 });
102
103 last_byte = mat.end();
105 last_char = start + len;
106 }
107 }
108
109 pub fn find(&mut self, text: &str) -> Vec<KeywordMatch> {
110 let mut matches = Vec::new();
111 self.find_into(text, &mut matches);
112 matches
113 }
114}
115
116impl Default for FlashText {
117 fn default() -> Self {
118 Self::new()
119 }
120}
121
122#[cfg(test)]
123mod tests {
124 use super::*;
125
126 #[test]
127 fn test_find_char_offsets_are_correct_for_unicode() {
128 let mut ft = FlashText::new();
129 ft.add_keyword("東京", "tokyo");
130 ft.add_keyword("Müller", "muller");
131
132 let text = "a 東京 b Müller c";
134 let matches = ft.find(text);
135
136 assert_eq!(
153 matches,
154 vec![
155 KeywordMatch {
156 keyword: "東京".to_string(),
157 value: "tokyo".to_string(),
158 start: 2,
159 end: 4
160 },
161 KeywordMatch {
162 keyword: "Müller".to_string(),
163 value: "muller".to_string(),
164 start: 7,
165 end: 13
166 }
167 ]
168 );
169 }
170}