1use crate::utils::split_as_char_ranges;
2use crate::{
3 Match,
4 TextRange,
5 BehaviorForUnmatched,
6};
7use crate::hashmap::Dictionary;
8
9pub fn segment_fully<T: AsRef<str>>(
11 text: T,
12 dict: &Dictionary,
13 behavior_for_unmatched: BehaviorForUnmatched,
14) -> Vec<Match> {
15 let text = text
16 .as_ref()
17 .to_lowercase();
18 let mut results: Vec<Match> = vec![];
19
20 let mut unconsumed_word_start_index: Option<usize> = None;
21 let mut unconsumed_char_start_index: Option<usize> = None;
22 let mut maximum_matched_end_index = 0;
23 (0..text.len()).into_iter().for_each(|start_index| {
24 if text.is_char_boundary(start_index) {
25 let mut matched_results: Vec<Match> = vec![];
26 ((start_index + 1)..=text.len())
27 .into_iter()
28 .for_each(|end_index| {
29 if text.is_char_boundary(end_index) {
30 let sub_text = &text[start_index..end_index];
31
32 if let Some(value) = dict.map.get(sub_text) {
33 let range = TextRange::new(
34 start_index,
35 end_index
36 );
37
38 let result = Match::new(range, Some(*value));
39 matched_results.push(result);
40
41 if range.end_index() > maximum_matched_end_index {
42 maximum_matched_end_index = range.end_index();
43 }
44 }
45 }
46 });
47
48 let mut unmatched_results: Vec<Match> = {
49 let mut unmatched_results = vec![];
50
51 match behavior_for_unmatched {
52 BehaviorForUnmatched::KeepAsWords => {
53 if matched_results.len() > 0 {
54 if let Some(index) = unconsumed_word_start_index {
56 let result = Match::new(
57 TextRange::new(index, start_index),
58 None,
59 );
60 unmatched_results.push(result);
61 unconsumed_word_start_index = None;
62 }
63 } else {
64 if start_index >= maximum_matched_end_index {
65 if let None = unconsumed_word_start_index {
66 unconsumed_word_start_index = Some(start_index);
67 }
68 }
69 }
70 },
71 BehaviorForUnmatched::KeepAsChars => {
72 if matched_results.len() > 0{
73 if let Some(index) = unconsumed_char_start_index {
75 let result = Match::new(
76 TextRange::new(index, start_index),
77 None,
78 );
79 results.push(result);
80 unconsumed_char_start_index = None;
81 }
82 } else {
83 if start_index >= maximum_matched_end_index {
84 if let None = unconsumed_char_start_index {
85 unconsumed_char_start_index = Some(start_index);
86 }
87 }
88 }
89 },
90 BehaviorForUnmatched::Ignore => (),
91 }
92
93 unmatched_results
94 };
95
96 results.append(&mut unmatched_results);
97 results.append(&mut matched_results);
98 }
99 });
100 if maximum_matched_end_index < text.len() {
101 match behavior_for_unmatched {
103 BehaviorForUnmatched::KeepAsWords => {
104 results.push(Match::new(
105 TextRange::new(maximum_matched_end_index, text.len()),
106 None
107 ))
108 },
109 BehaviorForUnmatched::KeepAsChars => {
110 let iter = split_as_char_ranges(&text[maximum_matched_end_index..])
111 .map(|range| {
112 Match::new(
113 TextRange::new(
114 maximum_matched_end_index + range.start_index(),
115 maximum_matched_end_index + range.end_index(),
116 ),
117 None
118 )
119 });
120
121 results.extend(iter);
122 }
123 BehaviorForUnmatched::Ignore => (),
124 }
125 }
126
127 results
128}
129
#[cfg(test)]
mod tests {
    use crate::hashmap::{segment_fully, Dictionary};
    use crate::BehaviorForUnmatched;

    /// Mixed Chinese/ASCII sample shared by most of the tests below.
    const SAMPLE_TEXT: &str = " 南京市长江大桥, hello world ";

    /// Builds the dictionary paired with `SAMPLE_TEXT`. Its last entry does
    /// not occur in the sample.
    fn sample_dict() -> Dictionary {
        Dictionary::new(vec![
            "南京",
            "南京市",
            "市长",
            "长江",
            "大桥",
            "你好世界",
        ])
        .unwrap()
    }

    /// With `Ignore`, only dictionary hits are reported.
    #[test]
    fn test_ignore_unmatched() {
        let dict = sample_dict();

        let segments: Vec<_> = segment_fully(SAMPLE_TEXT, &dict, BehaviorForUnmatched::Ignore)
            .into_iter()
            .map(|m| m.range().extract(SAMPLE_TEXT).unwrap())
            .collect();

        assert_eq!(segments, vec!["南京", "南京市", "市长", "长江", "大桥"]);
    }

    /// With `KeepAsChars`, uncovered text is reported one character at a time.
    #[test]
    fn test_keep_unmatched_as_chars() {
        let dict = sample_dict();

        let segments: Vec<_> =
            segment_fully(SAMPLE_TEXT, &dict, BehaviorForUnmatched::KeepAsChars)
                .into_iter()
                .map(|m| m.range().extract(SAMPLE_TEXT).unwrap())
                .collect();

        let expected = vec![
            " ", "南京", "南京市", "市长", "长江", "大桥", ",", " ", "h", "e", "l", "l", "o", " ",
            "w", "o", "r", "l", "d", " ",
        ];
        assert_eq!(segments, expected);
    }

    /// With `KeepAsWords`, each uncovered run is reported as a single piece.
    #[test]
    fn test_keep_unmatched_as_words() {
        let dict = sample_dict();

        let segments: Vec<_> =
            segment_fully(SAMPLE_TEXT, &dict, BehaviorForUnmatched::KeepAsWords)
                .into_iter()
                .map(|m| m.range().extract(SAMPLE_TEXT).unwrap())
                .collect();

        let expected = vec![
            " ",
            "南京",
            "南京市",
            "市长",
            "长江",
            "大桥",
            ", hello world ",
        ];
        assert_eq!(segments, expected);
    }

    /// Each hit reports the pattern index expected for its dictionary entry.
    #[test]
    fn test_value() {
        let dict = sample_dict();

        let pattern_indices: Vec<_> =
            segment_fully(SAMPLE_TEXT, &dict, BehaviorForUnmatched::Ignore)
                .into_iter()
                .map(|m| m.index_of_patterns().unwrap())
                .collect();

        assert_eq!(pattern_indices, vec![0, 1, 2, 3, 4]);
    }

    /// Matches touching the first and last byte of the text are found.
    #[test]
    fn test_chars_on_edge() {
        let text = "你好世界";
        let dict = Dictionary::new(vec!["你好", "世界"]).unwrap();

        let segments: Vec<_> = segment_fully(text, &dict, BehaviorForUnmatched::Ignore)
            .into_iter()
            .map(|m| m.range().extract(text).unwrap())
            .collect();

        assert_eq!(segments, vec!["你好", "世界"]);
    }
}