1use crate::utils::split_as_char_ranges;
2use crate::{
3 Match,
4 TextRange,
5 BehaviorForUnmatched,
6};
7use crate::hashmap::Dictionary;
8
9pub fn segment_forward_longest<T: AsRef<str>>(
11 text: T,
12 dict: &Dictionary,
13 behavior_for_unmatched: BehaviorForUnmatched,
14) -> Vec<Match> {
15 let text = text
16 .as_ref()
17 .to_lowercase();
18
19 let mut results: Vec<Match> = vec![];
20
21 let mut unconsumed_start_index: Option<usize> = None;
22 let mut maximum_matched_end_index = 0;
23 let mut start_index = 0;
24 while start_index < text.len() {
25 if text.is_char_boundary(start_index) {
26 let mut next_start_index = start_index + 1;
27
28 let mut matched_results: Vec<Match> = vec![];
29 let mut longest_match: Option<(
30 usize, usize, )> = None;
33 ((start_index + 1)..=text.len())
34 .into_iter()
35 .for_each(|end_index| {
36 if text.is_char_boundary(end_index) {
37 let sub_text = &text[start_index..end_index];
38
39 if let Some(value) = dict.map.get(sub_text) {
40 longest_match = Some((end_index, *value))
41 }
42 }
43 });
44
45 if let Some((end_index, value)) = longest_match{
46 let range = TextRange::new(
47 start_index,
48 end_index
49 );
50
51 let result = Match::new(range, Some(value));
52 matched_results.push(result);
53
54 next_start_index = range.end_index();
55 maximum_matched_end_index = range.end_index();
56 }
57
58 let mut unmatched_results: Vec<Match> = vec![];
59 match behavior_for_unmatched {
60 BehaviorForUnmatched::KeepAsWords => {
61 if matched_results.len() > 0 {
62 if let Some(index) = unconsumed_start_index {
64 let result = Match::new(
65 TextRange::new(index, start_index),
66 None,
67 );
68 unmatched_results.push(result);
69 unconsumed_start_index = None;
70 }
71 } else {
72 if start_index >= maximum_matched_end_index {
73 if let None = unconsumed_start_index {
74 unconsumed_start_index = Some(start_index);
75 }
76 }
77 }
78 },
79 BehaviorForUnmatched::KeepAsChars => {
80 if matched_results.len() > 0 {
81 if let Some(index) = unconsumed_start_index {
83 let result = Match::new(
84 TextRange::new(index, start_index),
85 None,
86 );
87 unmatched_results.push(result);
88 unconsumed_start_index = None;
89 }
90 } else {
91 if start_index >= maximum_matched_end_index {
92 if let None = unconsumed_start_index {
93 unconsumed_start_index = Some(start_index);
94 }
95 }
96 }
97 },
98 BehaviorForUnmatched::Ignore => (),
99 }
100
101 results.append(&mut unmatched_results);
102 results.append(&mut matched_results);
103
104 start_index = next_start_index;
105 } else {
106 start_index += 1;
107 }
108 }
109 if maximum_matched_end_index < text.len() {
110 match behavior_for_unmatched {
112 BehaviorForUnmatched::KeepAsWords => {
113 results.push(Match::new(
114 TextRange::new(maximum_matched_end_index, text.len()),
115 None
116 ))
117 },
118 BehaviorForUnmatched::KeepAsChars => {
119 let iter = split_as_char_ranges(&text[maximum_matched_end_index..])
120 .map(|range| {
121 Match::new(
122 TextRange::new(
123 maximum_matched_end_index + range.start_index(),
124 maximum_matched_end_index + range.end_index(),
125 ),
126 None
127 )
128 });
129
130 results.extend(iter);
131 }
132 BehaviorForUnmatched::Ignore => (),
133 }
134 }
135
136 results
137}
138
#[cfg(test)]
mod tests {
    use crate::BehaviorForUnmatched;
    use crate::hashmap::{
        segment_forward_longest,
        Dictionary,
    };

    /// Mixed CJK / ASCII sample, padded with a space on both sides, shared by most tests.
    const SAMPLE_TEXT: &str = " 商品和服务, hello world ";

    /// Builds the dictionary shared by most tests.
    ///
    /// "和服" and "服务" overlap inside the sample text, and "你好世界" does not occur in it.
    fn sample_dict() -> Dictionary {
        Dictionary::new(vec!["商品", "和服", "服务", "你好世界"]).unwrap()
    }

    /// With `Ignore`, only dictionary hits are returned.
    #[test]
    fn test_ignore_unmatched() {
        let dict = sample_dict();
        let matches = segment_forward_longest(
            SAMPLE_TEXT,
            &dict,
            BehaviorForUnmatched::Ignore,
        );

        let segments: Vec<_> = matches
            .into_iter()
            .map(|m| m.range().extract(SAMPLE_TEXT).unwrap())
            .collect();

        assert_eq!(segments, vec!["商品", "和服"]);
    }

    /// With `KeepAsChars`, every unmatched character is its own segment.
    #[test]
    fn test_keep_unmatched_as_chars() {
        let dict = sample_dict();
        let matches = segment_forward_longest(
            SAMPLE_TEXT,
            &dict,
            BehaviorForUnmatched::KeepAsChars,
        );

        let segments: Vec<_> = matches
            .into_iter()
            .map(|m| m.range().extract(SAMPLE_TEXT).unwrap())
            .collect();

        let expected = vec![
            " ", "商品", "和服", "务", ",", " ",
            "h", "e", "l", "l", "o", " ",
            "w", "o", "r", "l", "d", " ",
        ];
        assert_eq!(segments, expected);
    }

    /// With `KeepAsWords`, each unmatched run stays together as one segment.
    #[test]
    fn test_keep_unmatched_as_words() {
        let dict = sample_dict();
        let matches = segment_forward_longest(
            SAMPLE_TEXT,
            &dict,
            BehaviorForUnmatched::KeepAsWords,
        );

        let segments: Vec<_> = matches
            .into_iter()
            .map(|m| m.range().extract(SAMPLE_TEXT).unwrap())
            .collect();

        let expected = vec![" ", "商品", "和服", "务, hello world "];
        assert_eq!(segments, expected);
    }

    /// Each hit reports the index of the dictionary pattern that produced it.
    #[test]
    fn test_value() {
        let dict = sample_dict();
        let matches = segment_forward_longest(
            SAMPLE_TEXT,
            &dict,
            BehaviorForUnmatched::Ignore,
        );

        let pattern_indexes: Vec<_> = matches
            .into_iter()
            .map(|m| m.index_of_patterns().unwrap())
            .collect();

        assert_eq!(pattern_indexes, vec![0, 1]);
    }

    /// Matches that touch the very start and the very end of the text are both found.
    #[test]
    fn test_chars_on_edge() {
        let text = "你好世界";
        let dict = Dictionary::new(vec!["你好", "世界"]).unwrap();

        let matches = segment_forward_longest(
            text,
            &dict,
            BehaviorForUnmatched::Ignore,
        );

        let segments: Vec<_> = matches
            .into_iter()
            .map(|m| m.range().extract(text).unwrap())
            .collect();

        assert_eq!(segments, vec!["你好", "世界"]);
    }
}