1use crate::utils::split_as_char_ranges;
2use crate::{
3 Match,
4 TextRange,
5 BehaviorForUnmatched,
6 UltraNLPError,
7};
8use crate::cedarwood::ForwardDictionary;
9
10pub fn segment_forward_longest<T: AsRef<str>>(
12 text: T,
13 dict: &ForwardDictionary,
14 behavior_for_unmatched: BehaviorForUnmatched,
15) -> Vec<Match> {
16 let text = text
17 .as_ref()
18 .to_lowercase();
19
20 let mut results: Vec<Match> = vec![];
21
22 let mut unconsumed_start_index: Option<usize> = None;
23 let mut maximum_matched_end_index = 0;
24 let mut start_index = 0;
25 while start_index < text.len() {
26 if text.is_char_boundary(start_index) {
27 let mut next_start_index = start_index + 1;
28
29 let mut matched_results: Vec<Match> = vec![];
30 if let Some(matches) = dict.dat.common_prefix_search(&text[start_index..]) {
32 let longest_match: Option<(i32, usize)> = matches
33 .into_iter()
34 .reduce(| longest, current | {
35 let (_, longest_length) = longest;
36 let (_, current_length) = current;
37 if current_length > longest_length {
38 current
39 } else {
40 longest
41 }
42 });
43
44 if let Some((id, length)) = longest_match{
45 let range = TextRange::new(
46 start_index,
47 start_index + length + 1
48 );
49 let value = usize::try_from(id)
50 .map_err(|err| UltraNLPError::new(err.to_string()))
51 .unwrap();
53
54 let result = Match::new(range, Some(value));
55 matched_results.push(result);
56
57 next_start_index = range.end_index();
58 maximum_matched_end_index = range.end_index();
59 }
60 }
61
62 let mut unmatched_results: Vec<Match> = {
63 let mut unmatched_results = vec![];
64
65 match behavior_for_unmatched {
66 BehaviorForUnmatched::KeepAsWords => {
67 if matched_results.len() > 0 {
68 if let Some(index) = unconsumed_start_index {
70 let result = Match::new(
71 TextRange::new(index, start_index),
72 None,
73 );
74 unmatched_results.push(result);
75 unconsumed_start_index = None;
76 }
77 } else {
78 if start_index >= maximum_matched_end_index {
79 if let None = unconsumed_start_index {
80 unconsumed_start_index = Some(start_index);
81 }
82 }
83 }
84 },
85 BehaviorForUnmatched::KeepAsChars => {
86 if matched_results.len() > 0 {
87 if let Some(index) = unconsumed_start_index {
89 let result = Match::new(
90 TextRange::new(index, start_index),
91 None,
92 );
93 unmatched_results.push(result);
94 unconsumed_start_index = None;
95 }
96 } else {
97 if start_index >= maximum_matched_end_index {
98 if let None = unconsumed_start_index {
99 unconsumed_start_index = Some(start_index);
100 }
101 }
102 }
103 },
104 BehaviorForUnmatched::Ignore => (),
105 }
106
107 unmatched_results
108 };
109
110 results.append(&mut unmatched_results);
111 results.append(&mut matched_results);
112
113 start_index = next_start_index;
114 } else {
115 start_index += 1;
116 }
117 }
118 if maximum_matched_end_index < text.len() {
119 match behavior_for_unmatched {
121 BehaviorForUnmatched::KeepAsWords => {
122 results.push(Match::new(
123 TextRange::new(maximum_matched_end_index, text.len()),
124 None
125 ))
126 },
127 BehaviorForUnmatched::KeepAsChars => {
128 let iter = split_as_char_ranges(&text[maximum_matched_end_index..])
129 .map(|range| {
130 Match::new(
131 TextRange::new(
132 maximum_matched_end_index + range.start_index(),
133 maximum_matched_end_index + range.end_index(),
134 ),
135 None,
136 )
137 });
138
139 results.extend(iter);
140 }
141 BehaviorForUnmatched::Ignore => (),
142 }
143 }
144
145 results
146}
147
#[cfg(test)]
mod tests {
    use crate::BehaviorForUnmatched;
    use crate::cedarwood::{
        segment_forward_longest,
        ForwardDictionary,
    };

    /// Mixed Chinese/English sentence used by most of the tests below.
    const SAMPLE_TEXT: &str = " 商品和服务, hello world ";

    /// Builds the dictionary used together with `SAMPLE_TEXT`.
    fn sample_dictionary() -> ForwardDictionary {
        ForwardDictionary::new(
            vec!["商品", "和服", "服务", "你好世界"]
        ).unwrap()
    }

    // With `Ignore`, only the dictionary hits are returned.
    #[test]
    fn test_ignore_unmatched() {
        let dict = sample_dictionary();

        let matches = segment_forward_longest(
            SAMPLE_TEXT,
            &dict,
            BehaviorForUnmatched::Ignore,
        );
        let words: Vec<_> = matches
            .into_iter()
            .map(|m| m.range().extract(SAMPLE_TEXT).unwrap())
            .collect();

        assert_eq!(words, vec!["商品", "和服"]);
    }

    // With `KeepAsChars`, uncovered text comes back one character at a time.
    #[test]
    fn test_keep_unmatched_as_chars() {
        let dict = sample_dictionary();

        let matches = segment_forward_longest(
            SAMPLE_TEXT,
            &dict,
            BehaviorForUnmatched::KeepAsChars,
        );
        let words: Vec<_> = matches
            .into_iter()
            .map(|m| m.range().extract(SAMPLE_TEXT).unwrap())
            .collect();

        assert_eq!(
            words,
            vec![
                " ",
                "商品",
                "和服",
                "务",
                ",",
                " ",
                "h",
                "e",
                "l",
                "l",
                "o",
                " ",
                "w",
                "o",
                "r",
                "l",
                "d",
                " ",
            ],
        );
    }

    // With `KeepAsWords`, uncovered text comes back as whole runs.
    #[test]
    fn test_keep_unmatched_as_words() {
        let dict = sample_dictionary();

        let matches = segment_forward_longest(
            SAMPLE_TEXT,
            &dict,
            BehaviorForUnmatched::KeepAsWords,
        );
        let words: Vec<_> = matches
            .into_iter()
            .map(|m| m.range().extract(SAMPLE_TEXT).unwrap())
            .collect();

        assert_eq!(
            words,
            vec![
                " ",
                "商品",
                "和服",
                "务, hello world ",
            ],
        );
    }

    // Each hit reports the position of its pattern in the dictionary input.
    #[test]
    fn test_value() {
        let dict = sample_dictionary();

        let matches = segment_forward_longest(
            SAMPLE_TEXT,
            &dict,
            BehaviorForUnmatched::Ignore,
        );
        let indices: Vec<_> = matches
            .into_iter()
            .map(|m| m.index_of_patterns().unwrap())
            .collect();

        assert_eq!(indices, vec![0, 1]);
    }

    // Matches touching the very start and very end of the text are both found.
    #[test]
    fn test_chars_on_edge() {
        let text = "你好世界";
        let dict = ForwardDictionary::new(
            vec!["你好", "世界"]
        ).unwrap();

        let matches = segment_forward_longest(
            text,
            &dict,
            BehaviorForUnmatched::Ignore,
        );
        let words: Vec<_> = matches
            .into_iter()
            .map(|m| m.range().extract(text).unwrap())
            .collect();

        assert_eq!(words, vec!["你好", "世界"]);
    }
}