1use crate::utils::split_as_char_ranges;
2use crate::{
3 Match,
4 TextRange,
5 BehaviorForUnmatched,
6 UltraNLPError,
7};
8use crate::cedarwood::BackwardDictionary;
9
10pub fn segment_backward_longest<T: AsRef<str>>(
12 text: T,
13 dict: &BackwardDictionary,
14 behavior_for_unmatched: BehaviorForUnmatched,
15) -> Vec<Match> {
16 let text = text
17 .as_ref()
18 .to_lowercase()
19 .chars()
20 .rev()
21 .collect::<String>();
22
23 let mut results: Vec<Match> = vec![];
24
25 let mut unconsumed_start_index: Option<usize> = None;
26 let mut maximum_matched_end_index = 0;
27 let mut start_index = 0;
28 while start_index < text.len() {
29 if text.is_char_boundary(start_index) {
30 let mut next_start_index = start_index + 1;
31
32 let mut matched_results: Vec<Match> = vec![];
33 if let Some(matches) = dict.dat.common_prefix_search(&text[start_index..]) {
35 let longest_match: Option<(i32, usize)> = matches
36 .into_iter()
37 .reduce(| longest, current | {
38 let (_, longest_length) = longest;
39 let (_, current_length) = current;
40 if current_length > longest_length {
41 current
42 } else {
43 longest
44 }
45 });
46
47 if let Some((id, length)) = longest_match {
48 let end_index = start_index + length + 1;
49 let range = TextRange::new(
50 text.len() - end_index,
51 text.len() - start_index,
52 );
53 let value = usize::try_from(id)
54 .map_err(|err| UltraNLPError::new(err.to_string()))
55 .unwrap();
57
58 let result = Match::new(range, Some(value));
59 matched_results.push(result);
60
61 next_start_index = start_index + length + 1;
62 maximum_matched_end_index = start_index + length + 1;
63 }
64 }
65
66 let mut unmatched_results: Vec<Match> = {
67 let mut unmatched_results: Vec<Match> = vec![];
68
69 match behavior_for_unmatched {
70 BehaviorForUnmatched::KeepAsWords => {
71 if matched_results.len() > 0 {
72 if let Some(index) = unconsumed_start_index {
74 let result = Match::new(
75 TextRange::new(
76 text.len() - start_index,
77 text.len() - index,
78 ),
79 None,
80 );
81 unmatched_results.push(result);
82 unconsumed_start_index = None;
83 }
84 } else {
85 if start_index >= maximum_matched_end_index {
86 if let None = unconsumed_start_index {
87 unconsumed_start_index = Some(start_index);
88 }
89 }
90 }
91 },
92 BehaviorForUnmatched::KeepAsChars => {
93 if matched_results.len() > 0 {
94 if let Some(index) = unconsumed_start_index {
96 let iter = split_as_char_ranges(&text[index..start_index])
97 .map(|range| {
98 Match::new(
99 TextRange::new(
100 text.len() - (index + range.end_index()),
101 text.len() - (index + range.start_index()),
102 ),
103 None,
104 )
105 });
106
107 unmatched_results.extend(iter);
108 unconsumed_start_index = None;
109 }
110 } else {
111 if start_index >= maximum_matched_end_index {
112 if let None = unconsumed_start_index {
113 unconsumed_start_index = Some(start_index);
114 }
115 }
116 }
117 },
118 BehaviorForUnmatched::Ignore => (),
119 }
120
121 unmatched_results
122 };
123
124 results.append(&mut unmatched_results);
125 results.append(&mut matched_results);
126
127 start_index = next_start_index;
128 } else {
129 start_index += 1;
130 }
131 }
132 if maximum_matched_end_index < text.len() {
133 match behavior_for_unmatched {
135 BehaviorForUnmatched::KeepAsWords => {
136 results.push(Match::new(
137 TextRange::new(
138 0,
139 text.len() - maximum_matched_end_index,
140 ),
141 None
142 ))
143 },
144 BehaviorForUnmatched::KeepAsChars => {
145 let iter = split_as_char_ranges(&text[maximum_matched_end_index..])
146 .map(|range| {
147 Match::new(
148 TextRange::new(
149 text.len() - (maximum_matched_end_index + range.end_index()),
150 text.len() - (maximum_matched_end_index + range.start_index()),
151 ),
152 None
153 )
154 });
155
156 results.extend(iter);
157 }
158 BehaviorForUnmatched::Ignore => (),
159 }
160 }
161
162 results.reverse();
163
164 results
165}
166
#[cfg(test)]
mod tests {
    use crate::BehaviorForUnmatched;
    use crate::cedarwood::{
        segment_backward_longest,
        BackwardDictionary,
    };

    /// Mixed Chinese/English sample shared by most tests.
    const SAMPLE_TEXT: &str = " 商品和服务, hello world ";

    /// Dictionary used together with `SAMPLE_TEXT`.
    fn sample_dict() -> BackwardDictionary {
        BackwardDictionary::new(
            vec!["商品", "和服", "服务", "你好世界"]
        ).unwrap()
    }

    // Unmatched stretches are dropped entirely.
    #[test]
    fn test_ignore_unmatched() {
        let dict = sample_dict();

        let segments = segment_backward_longest(
            SAMPLE_TEXT,
            &dict,
            BehaviorForUnmatched::Ignore,
        )
        .into_iter()
        .map(|found| found.range().extract(SAMPLE_TEXT).unwrap())
        .collect::<Vec<_>>();

        assert_eq!(segments, vec!["商品", "服务",]);
    }

    // Every unmatched char becomes its own segment.
    #[test]
    fn test_keep_unmatched_as_chars() {
        let dict = sample_dict();

        let segments = segment_backward_longest(
            SAMPLE_TEXT,
            &dict,
            BehaviorForUnmatched::KeepAsChars,
        )
        .into_iter()
        .map(|found| found.range().extract(SAMPLE_TEXT).unwrap())
        .collect::<Vec<_>>();

        assert_eq!(
            segments,
            vec![
                " ", "商品", "和", "服务", ",", " ",
                "h", "e", "l", "l", "o", " ",
                "w", "o", "r", "l", "d", " ",
            ]
        );
    }

    // Each unmatched stretch is kept as a single segment.
    #[test]
    fn test_keep_unmatched_as_words() {
        let dict = sample_dict();

        let segments = segment_backward_longest(
            SAMPLE_TEXT,
            &dict,
            BehaviorForUnmatched::KeepAsWords,
        )
        .into_iter()
        .map(|found| found.range().extract(SAMPLE_TEXT).unwrap())
        .collect::<Vec<_>>();

        assert_eq!(
            segments,
            vec![" ", "商品", "和", "服务", ", hello world "]
        );
    }

    // Matched segments report the index of the pattern that produced them.
    #[test]
    fn test_value() {
        let dict = sample_dict();

        let pattern_indexes = segment_backward_longest(
            SAMPLE_TEXT,
            &dict,
            BehaviorForUnmatched::Ignore,
        )
        .into_iter()
        .map(|found| found.index_of_patterns().unwrap())
        .collect::<Vec<_>>();

        assert_eq!(pattern_indexes, vec![0, 2]);
    }

    // Matches touching both ends of the text are reported.
    #[test]
    fn test_chars_on_edge() {
        let text = "你好世界";
        let dict = BackwardDictionary::new(
            vec!["你好", "世界"]
        ).unwrap();

        let segments = segment_backward_longest(
            text,
            &dict,
            BehaviorForUnmatched::Ignore,
        )
        .into_iter()
        .map(|found| found.range().extract(text).unwrap())
        .collect::<Vec<_>>();

        assert_eq!(segments, vec!["你好", "世界"]);
    }
}