1use crate::utils::split_as_char_ranges;
2use crate::{
3 Match,
4 TextRange,
5 BehaviorForUnmatched, UltraNLPError,
6};
7use crate::cedarwood::ForwardDictionary;
8
9pub fn segment_fully<T: AsRef<str>>(
11 text: T,
12 dict: &ForwardDictionary,
13 behavior_for_unmatched: BehaviorForUnmatched,
14) -> Vec<Match> {
15 let text = text.as_ref().to_lowercase();
16 let mut results: Vec<Match> = vec![];
17
18 let mut unconsumed_word_start_index: Option<usize> = None;
19 let mut unconsumed_char_start_index: Option<usize> = None;
20 let mut maximum_matched_end_index = 0;
21 (0..text.len()).for_each(|start_index| {
22 if text.is_char_boundary(start_index) {
23 let mut matched_results: Vec<Match> = vec![];
24 if let Some(matches) = dict.dat.common_prefix_search(&text[start_index..]) {
26 matches
27 .into_iter()
28 .for_each(|(id, length)| {
29 let range = TextRange::new(
30 start_index,
31 start_index + length + 1
32 );
33 let value = usize::try_from(id)
34 .map_err(|err| UltraNLPError::new(err.to_string()))
35 .unwrap();
37
38 let result = Match::new(range, Some(value));
39 matched_results.push(result);
40
41 if range.end_index() > maximum_matched_end_index {
42 maximum_matched_end_index = range.end_index();
43 }
44 });
45 }
46
47 let mut unmatched_results: Vec<Match> = {
48 let mut unmatched_results = vec![];
49
50 match behavior_for_unmatched {
51 BehaviorForUnmatched::KeepAsWords => {
52 if matched_results.len() > 0 {
53 if let Some(index) = unconsumed_word_start_index {
55 let result = Match::new(
56 TextRange::new(index, start_index),
57 None,
58 );
59 unmatched_results.push(result);
60 unconsumed_word_start_index = None;
61 }
62 } else {
63 if start_index >= maximum_matched_end_index {
64 if let None = unconsumed_word_start_index {
65 unconsumed_word_start_index = Some(start_index);
66 }
67 }
68 }
69 },
70 BehaviorForUnmatched::KeepAsChars => {
71 if matched_results.len() > 0{
72 if let Some(index) = unconsumed_char_start_index {
74 let result = Match::new(
75 TextRange::new(index, start_index),
76 None,
77 );
78 results.push(result);
79 unconsumed_char_start_index = None;
80 }
81 } else {
82 if start_index >= maximum_matched_end_index {
83 if let None = unconsumed_char_start_index {
84 unconsumed_char_start_index = Some(start_index);
85 }
86 }
87 }
88 },
89 BehaviorForUnmatched::Ignore => (),
90 }
91
92 unmatched_results
93 };
94
95 results.append(&mut unmatched_results);
96 results.append(&mut matched_results);
97 }
98 });
99 if maximum_matched_end_index < text.len() {
100 match behavior_for_unmatched {
102 BehaviorForUnmatched::KeepAsWords => {
103 results.push(Match::new(
104 TextRange::new(maximum_matched_end_index, text.len()),
105 None
106 ))
107 },
108 BehaviorForUnmatched::KeepAsChars => {
109 let iter = split_as_char_ranges(&text[maximum_matched_end_index..])
110 .map(|range| {
111 Match::new(
112 TextRange::new(
113 maximum_matched_end_index + range.start_index(),
114 maximum_matched_end_index + range.end_index(),
115 ),
116 None
117 )
118 });
119
120 results.extend(iter);
121 }
122 BehaviorForUnmatched::Ignore => (),
123 }
124 }
125
126 results
127}
128
#[cfg(test)]
mod tests {
    use crate::BehaviorForUnmatched;
    use crate::cedarwood::{
        segment_fully,
        ForwardDictionary,
    };

    /// Sentence shared by the tests below: dictionary words surrounded by
    /// text that the dictionary does not contain.
    const SAMPLE_TEXT: &str = " 南京市长江大桥, hello world ";

    /// Builds the dictionary used together with `SAMPLE_TEXT`. The last
    /// pattern never occurs in the sample text.
    fn sample_dictionary() -> ForwardDictionary {
        ForwardDictionary::new(
            vec!["南京", "南京市", "市长", "长江", "大桥", "你好世界"]
        ).unwrap()
    }

    /// With `Ignore`, only dictionary words are returned.
    #[test]
    fn test_ignore_unmatched() {
        let text = SAMPLE_TEXT;
        let dict = sample_dictionary();

        let words = segment_fully(text, &dict, BehaviorForUnmatched::Ignore)
            .into_iter()
            .map(|x| x.range().extract(text).unwrap())
            .collect::<Vec<_>>();

        assert_eq!(
            words,
            vec!["南京", "南京市", "市长", "长江", "大桥"]
        );
    }

    /// With `KeepAsChars`, uncovered text is returned one character at a time.
    #[test]
    fn test_keep_unmatched_as_chars() {
        let text = SAMPLE_TEXT;
        let dict = sample_dictionary();

        let words = segment_fully(text, &dict, BehaviorForUnmatched::KeepAsChars)
            .into_iter()
            .map(|x| x.range().extract(text).unwrap())
            .collect::<Vec<_>>();

        assert_eq!(
            words,
            vec![
                " ",
                "南京",
                "南京市",
                "市长",
                "长江",
                "大桥",
                ",",
                " ",
                "h",
                "e",
                "l",
                "l",
                "o",
                " ",
                "w",
                "o",
                "r",
                "l",
                "d",
                " ",
            ]
        );
    }

    /// With `KeepAsWords`, each uncovered run is returned as a single piece.
    #[test]
    fn test_keep_unmatched_as_words() {
        let text = SAMPLE_TEXT;
        let dict = sample_dictionary();

        let words = segment_fully(text, &dict, BehaviorForUnmatched::KeepAsWords)
            .into_iter()
            .map(|x| x.range().extract(text).unwrap())
            .collect::<Vec<_>>();

        assert_eq!(
            words,
            vec![
                " ",
                "南京",
                "南京市",
                "市长",
                "长江",
                "大桥",
                ", hello world ",
            ]
        );
    }

    /// Each match carries the index of the dictionary pattern it came from.
    #[test]
    fn test_value() {
        let text = SAMPLE_TEXT;
        let dict = sample_dictionary();

        let pattern_indexes = segment_fully(text, &dict, BehaviorForUnmatched::Ignore)
            .into_iter()
            .map(|x| x.index_of_patterns().unwrap())
            .collect::<Vec<_>>();

        assert_eq!(
            pattern_indexes,
            vec![0, 1, 2, 3, 4]
        );
    }

    /// Words touching the very start and very end of the text are found.
    #[test]
    fn test_chars_on_edge() {
        let text = "你好世界";
        let dict = ForwardDictionary::new(
            vec!["你好", "世界"]
        ).unwrap();

        let words = segment_fully(text, &dict, BehaviorForUnmatched::Ignore)
            .into_iter()
            .map(|x| x.range().extract(text).unwrap())
            .collect::<Vec<_>>();

        assert_eq!(
            words,
            vec!["你好", "世界"]
        );
    }
}