1use crate::{
2 Match,
3 TextRange,
4 BehaviorForUnmatched,
5 utils::split_as_char_ranges,
6};
7use crate::daachorse::ForwardDictionary;
8
9pub fn segment_forward_longest<T: AsRef<str>>(
11 text: T,
12 dict: &ForwardDictionary,
13 behavior_for_unmatched: BehaviorForUnmatched,
14) -> Vec<Match> {
15 let text = text.as_ref().to_lowercase();
16 let mut results: Vec<Match> = vec![];
17
18 let mut start_index = 0;
19 while start_index < text.len() {
20 if text.is_char_boundary(start_index) {
21 let mut iter = dict.acdat.leftmost_find_iter(&text[start_index..]);
22
23 match iter.next() {
24 Some(mat) => {
25 let real_mat_start_index = start_index + mat.start();
26 let real_mat_end_index = start_index + mat.end();
27 let result = Match::new(
28 TextRange::new(real_mat_start_index, real_mat_end_index),
29 Some(mat.value())
30 );
31
32 if mat.start() > 0 {
33 match behavior_for_unmatched {
35 BehaviorForUnmatched::Ignore => {},
36 BehaviorForUnmatched::KeepAsWords => {
37 results.push(
38 Match::new(
39 TextRange::new(
40 start_index, start_index + mat.start()
41 ),
42 None
43 )
44 );
45 },
46 BehaviorForUnmatched::KeepAsChars => {
47 let iter = split_as_char_ranges(
48 &text[start_index..start_index + mat.start()]
49 )
50 .map(|range| {
51 Match::new(
52 TextRange::new(
53 start_index + range.start_index(),
54 start_index + range.end_index(),
55 ),
56 None,
57 )
58 });
59
60 results.extend(iter);
61 },
62 }
63 }
64
65 start_index = real_mat_end_index;
66
67 results.push(result);
68 }
69 None => {
70 match behavior_for_unmatched {
72 BehaviorForUnmatched::Ignore => {},
73 BehaviorForUnmatched::KeepAsWords => {
74 results.push(
75 Match::new(
76 TextRange::new(
77 start_index,
78 text.len()
79 ),
80 None,
81 )
82 );
83 },
84 BehaviorForUnmatched::KeepAsChars => {
85 let iter = split_as_char_ranges(&text[start_index..])
86 .map(|range| {
87 Match::new(
88 TextRange::new(
89 start_index + range.start_index(),
90 start_index + range.end_index(),
91 ),
92 None,
93 )
94 });
95
96 results.extend(iter);
97 },
98 }
99
100 break;
101 }
102 }
103 } else {
104 start_index += 1;
105 }
106 }
107
108 results
109}
110
#[cfg(test)]
mod tests {
    use crate::daachorse::{segment_forward_longest, ForwardDictionary};
    use crate::BehaviorForUnmatched;

    /// Mixed CJK / ASCII sample used by most tests.
    const MIXED_TEXT: &str = " 商品和服务, hello world ";

    /// Dictionary entries paired with `MIXED_TEXT`.
    const MIXED_PATTERNS: [&str; 4] = ["商品", "和服", "服务", "你好世界"];

    /// Builds a dictionary from `patterns` and segments `text` with it.
    fn run(
        text: &str,
        patterns: &[&'static str],
        behavior: BehaviorForUnmatched,
    ) -> Vec<crate::Match> {
        let dict = ForwardDictionary::new(patterns.to_vec()).unwrap();
        segment_forward_longest(text, &dict, behavior)
    }

    #[test]
    fn test_ignore_unmatched() {
        let segments: Vec<_> = run(MIXED_TEXT, &MIXED_PATTERNS, BehaviorForUnmatched::Ignore)
            .into_iter()
            .map(|m| m.range().extract(MIXED_TEXT).unwrap())
            .collect();

        assert_eq!(segments, vec!["商品", "和服"]);
    }

    #[test]
    fn test_keep_unmatched_as_chars() {
        let segments: Vec<_> = run(
            MIXED_TEXT,
            &MIXED_PATTERNS,
            BehaviorForUnmatched::KeepAsChars,
        )
        .into_iter()
        .map(|m| m.range().extract(MIXED_TEXT).unwrap())
        .collect();

        assert_eq!(
            segments,
            vec![
                " ", "商品", "和服", "务", ",", " ", "h", "e", "l", "l", "o", " ", "w", "o", "r",
                "l", "d", " ",
            ],
        );
    }

    #[test]
    fn test_keep_unmatched_as_words() {
        let segments: Vec<_> = run(
            MIXED_TEXT,
            &MIXED_PATTERNS,
            BehaviorForUnmatched::KeepAsWords,
        )
        .into_iter()
        .map(|m| m.range().extract(MIXED_TEXT).unwrap())
        .collect();

        assert_eq!(segments, vec![" ", "商品", "和服", "务, hello world "]);
    }

    #[test]
    fn test_value() {
        // Matched segments report the index of the pattern that produced them.
        let pattern_indices: Vec<_> =
            run(MIXED_TEXT, &MIXED_PATTERNS, BehaviorForUnmatched::Ignore)
                .into_iter()
                .map(|m| m.index_of_patterns().unwrap())
                .collect();

        assert_eq!(pattern_indices, vec![0, 1]);
    }

    #[test]
    fn test_chars_on_edge() {
        // Matches touching both ends of the text, with nothing unmatched around them.
        let text = "你好世界";

        let segments: Vec<_> = run(text, &["你好", "世界"], BehaviorForUnmatched::Ignore)
            .into_iter()
            .map(|m| m.range().extract(text).unwrap())
            .collect();

        assert_eq!(segments, vec!["你好", "世界"]);
    }
}