1use crate::utils::split_as_char_ranges;
2use crate::{
3 Match,
4 TextRange,
5 BehaviorForUnmatched,
6};
7use crate::hashmap::Dictionary;
8
9pub fn segment_backward_longest<T: AsRef<str>>(
11 text: T,
12 dict: &Dictionary,
13 behavior_for_unmatched: BehaviorForUnmatched,
14) -> Vec<Match> {
15 let text = text
16 .as_ref()
17 .to_lowercase();
18
19 let mut results: Vec<Match> = vec![];
20
21 let mut unconsumed_end_index: Option<usize> = None;
22 let mut minimum_matched_start_index = text.len();
23 let mut end_index = text.len();
24 while end_index > 0 {
25 if text.is_char_boundary(end_index) {
26 let mut next_end_index = end_index - 1;
27
28 let mut matched_results: Vec<Match> = vec![];
29 let mut longest_match: Option<(
30 usize, usize, )> = None;
33 (0..end_index)
34 .rev()
35 .into_iter()
36 .for_each(|start_index| {
37 if text.is_char_boundary(start_index) {
38 let sub_text = &text[start_index..end_index];
39
40 if let Some(value) = dict.map.get(sub_text) {
41 longest_match = Some((start_index, *value))
42 }
43 }
44 });
45
46 if let Some((start_index, value)) = longest_match {
47 let range = TextRange::new(
48 start_index,
49 end_index,
50 );
51
52 let result = Match::new(range, Some(value));
53 matched_results.push(result);
54
55 next_end_index = start_index;
56 minimum_matched_start_index = start_index;
57 }
58
59 let mut unmatched_results: Vec<Match> = {
60 let mut unmatched_results: Vec<Match> = vec![];
61
62 match behavior_for_unmatched {
63 BehaviorForUnmatched::KeepAsWords => {
64 if matched_results.len() > 0 {
65 if let Some(index) = unconsumed_end_index {
67 let result = Match::new(
68 TextRange::new(
69 end_index,
70 index,
71 ),
72 None,
73 );
74 unmatched_results.push(result);
75 unconsumed_end_index = None;
76 }
77 } else {
78 if end_index <= minimum_matched_start_index {
79 if let None = unconsumed_end_index {
80 unconsumed_end_index = Some(end_index);
81 }
82 }
83 }
84 },
85 BehaviorForUnmatched::KeepAsChars => {
86 if matched_results.len() > 0 {
87 if let Some(index) = unconsumed_end_index {
89 let iter = split_as_char_ranges(&text[end_index..index])
90 .map(|range| {
91 Match::new(
92 TextRange::new(
93 end_index + range.start_index(),
94 end_index + range.end_index(),
95 ),
96 None,
97 )
98 });
99
100 unmatched_results.extend(iter);
101 unmatched_results.reverse();
102 unconsumed_end_index = None;
103 }
104 } else {
105 if end_index >= minimum_matched_start_index {
106 if let None = unconsumed_end_index {
107 unconsumed_end_index = Some(end_index);
108 }
109 }
110 }
111 },
112 BehaviorForUnmatched::Ignore => (),
113 }
114
115 unmatched_results
116 };
117
118 results.append(&mut unmatched_results);
119 results.append(&mut matched_results);
120
121 end_index = next_end_index;
122 } else {
123 end_index -= 1;
124 }
125 }
126 if minimum_matched_start_index > 0 {
127 match behavior_for_unmatched {
129 BehaviorForUnmatched::KeepAsWords => {
130 results.push(Match::new(
131 TextRange::new(
132 0,
133 minimum_matched_start_index,
134 ),
135 None
136 ));
137 },
138 BehaviorForUnmatched::KeepAsChars => {
139 let iter = split_as_char_ranges(&text[0..minimum_matched_start_index])
140 .map(|range| {
141 Match::new(
142 TextRange::new(
143 range.start_index(),
144 range.end_index(),
145 ),
146 None
147 )
148 });
149
150 results.extend(iter);
151 }
152 BehaviorForUnmatched::Ignore => (),
153 }
154 }
155
156 results.reverse();
157
158 results
159}
160
161#[cfg(test)]
162mod tests {
163 use crate::BehaviorForUnmatched;
164 use crate::hashmap::{
165 segment_backward_longest,
166 Dictionary,
167 };
168
169 #[test]
170 fn test_ignore_unmatched() {
171 let text = " 商品和服务, hello world ";
172 let dict = Dictionary::new(
173 vec!["商品", "和服", "服务", "你好世界"]
174 ).unwrap();
175
176 let result = segment_backward_longest(
177 text,
178 &dict,
179 BehaviorForUnmatched::Ignore
180 );
181
182 assert_eq!(
183 result
184 .iter()
185 .map(|x| x.range().extract(text).unwrap())
186 .collect::<Vec<_>>(),
187 vec!["商品", "服务",]
188 );
189 }
190
191 #[test]
192 fn test_keep_unmatched_as_chars() {
193 let text = " 商品和服务, hello world ";
194 let dict = Dictionary::new(
195 vec!["商品", "和服", "服务", "你好世界"]
196 ).unwrap();
197
198 let result = segment_backward_longest(
199 text,
200 &dict,
201 BehaviorForUnmatched::KeepAsChars
202 );
203
204 assert_eq!(
205 result
206 .into_iter()
207 .map(|x| x.range().extract(text).unwrap())
208 .collect::<Vec<_>>(),
209 vec![
210 " ",
211 "商品",
212 "和",
213 "服务",
214 ",",
215 " ",
216 "h",
217 "e",
218 "l",
219 "l",
220 "o",
221 " ",
222 "w",
223 "o",
224 "r",
225 "l",
226 "d",
227 " ",
228 ]
229 );
230 }
231
232 #[test]
233 fn test_keep_unmatched_as_words() {
234 let text = " 商品和服务, hello world ";
235 let dict = Dictionary::new(
236 vec!["商品", "和服", "服务", "你好世界"]
237 ).unwrap();
238
239 let result = segment_backward_longest(
240 text,
241 &dict,
242 BehaviorForUnmatched::KeepAsWords
243 );
244
245 assert_eq!(
246 result
247 .into_iter()
248 .map(|x| x.range().extract(text).unwrap())
249 .collect::<Vec<_>>(),
250 vec![
251 " ",
252 "商品",
253 "和",
254 "服务",
255 ", hello world ",
256 ]
257 );
258 }
259
260 #[test]
261 fn test_value() {
262 let text = " 商品和服务, hello world ";
263 let dict = Dictionary::new(
264 vec![
265 "商品",
266 "和服",
267 "服务",
268 "你好世界",
269 ]
270 ).unwrap();
271
272 let result = segment_backward_longest(
273 text,
274 &dict,
275 BehaviorForUnmatched::Ignore
276 );
277
278 assert_eq!(
279 result
280 .into_iter()
281 .map(|x| x.index_of_patterns().unwrap())
282 .collect::<Vec<_>>(),
283 vec![0, 2]
284 );
285 }
286
287 #[test]
288 fn test_chars_on_edge() {
289 let text = "你好世界";
290 let dict = Dictionary::new(
291 vec!["你好", "世界"]
292 ).unwrap();
293
294 let result = segment_backward_longest(
295 text,
296 &dict,
297 BehaviorForUnmatched::Ignore
298 );
299
300 assert_eq!(
301 result
302 .into_iter()
303 .map(|x| x.range().extract(text).unwrap())
304 .collect::<Vec<_>>(),
305 vec!["你好", "世界"]
306 );
307 }
308}