1use crate::{
2 Match,
3 TextRange,
4 BehaviorForUnmatched,
5 utils::split_as_char_ranges,
6};
7use crate::daachorse::BackwardDictionary;
8
9pub fn segment_backward_longest<T: AsRef<str>>(
11 text: T,
12 dict: &BackwardDictionary,
13 behavior_for_unmatched: BehaviorForUnmatched,
14) -> Vec<Match> {
15 let text = text
16 .as_ref()
17 .to_lowercase()
18 .chars()
19 .rev()
20 .collect::<String>();
21
22 let mut results: Vec<Match> = vec![];
23
24 let mut start_index = 0;
25 while start_index < text.len() {
26 if text.is_char_boundary(start_index) {
27 let mut iter = dict.acdat.leftmost_find_iter(&text[start_index..]);
28
29 match iter.next() {
30 Some(mat) => {
31 let real_mat_start_index = start_index + mat.start();
32 let real_mat_end_index = start_index + mat.end();
33 let result = Match::new(
34 TextRange::new(
35 text.len() - real_mat_end_index,
36 text.len() - real_mat_start_index,
37 ),
38 Some(mat.value())
39 );
40
41 if mat.start() > 0 {
42 match behavior_for_unmatched {
44 BehaviorForUnmatched::Ignore => {},
45 BehaviorForUnmatched::KeepAsWords => {
46 let result = Match::new(
47 TextRange::new(
48 text.len() - (start_index + mat.start()),
49 text.len() - start_index,
50 ),
51 None,
52 );
53 results.push(result);
54 },
55 BehaviorForUnmatched::KeepAsChars => {
56 let iter = split_as_char_ranges(&text[start_index..start_index + mat.start()])
57 .map(|range| {
58 Match::new(
59 TextRange::new(
60 text.len() - (start_index + range.end_index()),
61 text.len() - (start_index + range.start_index()),
62 ),
63 None,
64 )
65 });
66
67 results.extend(iter);
68 },
69 }
70 }
71
72 start_index = real_mat_end_index;
73
74 results.push(result);
75 }
76 None => {
77 match behavior_for_unmatched {
79 BehaviorForUnmatched::Ignore => {},
80 BehaviorForUnmatched::KeepAsWords => {
81 results.push(
82 Match::new(
83 TextRange::new(
84 0,
85 text.len() - start_index
86 ),
87 None,
88 )
89 );
90 },
91 BehaviorForUnmatched::KeepAsChars => {
92 let iter = split_as_char_ranges(&text[start_index..])
93 .map(|range| {
94 Match::new(
95 TextRange::new(
96 text.len() - (start_index + range.end_index()),
97 text.len() - (start_index + range.start_index()),
98 ),
99 None,
100 )
101 });
102
103 results.extend(iter);
104 },
105 }
106
107 start_index += 1;
108 }
109 }
110 } else {
111 start_index += 1;
112 }
113 }
114
115 results.reverse();
116
117 results
118}
119
#[cfg(test)]
mod tests {
    use crate::BehaviorForUnmatched;
    use crate::daachorse::{
        segment_backward_longest,
        BackwardDictionary,
    };

    /// Mixed CJK / ASCII sample shared by most tests; note the leading and
    /// trailing space.
    const SAMPLE_TEXT: &str = " 商品和服务, hello world ";

    /// Patterns for the shared dictionary. "和服" overlaps both "商品和" and
    /// "服务" in the sample, which is what makes the scan direction observable.
    const SAMPLE_PATTERNS: [&str; 4] = ["商品", "和服", "服务", "你好世界"];

    /// With `Ignore`, only dictionary hits are returned.
    #[test]
    fn test_ignore_unmatched() {
        let dictionary = BackwardDictionary::new(SAMPLE_PATTERNS.to_vec()).unwrap();

        let matches = segment_backward_longest(
            SAMPLE_TEXT,
            &dictionary,
            BehaviorForUnmatched::Ignore,
        );
        let segments = matches
            .into_iter()
            .map(|m| m.range().extract(SAMPLE_TEXT).unwrap())
            .collect::<Vec<_>>();

        assert_eq!(segments, vec!["商品", "服务",]);
    }

    /// With `KeepAsChars`, every unmatched character becomes its own segment.
    #[test]
    fn test_keep_unmatched_as_chars() {
        let dictionary = BackwardDictionary::new(SAMPLE_PATTERNS.to_vec()).unwrap();

        let matches = segment_backward_longest(
            SAMPLE_TEXT,
            &dictionary,
            BehaviorForUnmatched::KeepAsChars,
        );
        let segments = matches
            .into_iter()
            .map(|m| m.range().extract(SAMPLE_TEXT).unwrap())
            .collect::<Vec<_>>();

        let expected = vec![
            " ",
            "商品",
            "和",
            "服务",
            ",",
            " ",
            "h",
            "e",
            "l",
            "l",
            "o",
            " ",
            "w",
            "o",
            "r",
            "l",
            "d",
            " ",
        ];
        assert_eq!(segments, expected);
    }

    /// With `KeepAsWords`, each unmatched stretch is kept as a single segment.
    #[test]
    fn test_keep_unmatched_as_words() {
        let dictionary = BackwardDictionary::new(SAMPLE_PATTERNS.to_vec()).unwrap();

        let matches = segment_backward_longest(
            SAMPLE_TEXT,
            &dictionary,
            BehaviorForUnmatched::KeepAsWords,
        );
        let segments = matches
            .into_iter()
            .map(|m| m.range().extract(SAMPLE_TEXT).unwrap())
            .collect::<Vec<_>>();

        let expected = vec![
            " ",
            "商品",
            "和",
            "服务",
            ", hello world ",
        ];
        assert_eq!(segments, expected);
    }

    /// Each dictionary hit reports a pattern index; the expected values
    /// correspond to the positions of "商品" and "服务" in the pattern list.
    #[test]
    fn test_value() {
        let dictionary = BackwardDictionary::new(SAMPLE_PATTERNS.to_vec()).unwrap();

        let matches = segment_backward_longest(
            SAMPLE_TEXT,
            &dictionary,
            BehaviorForUnmatched::Ignore,
        );
        let pattern_indices = matches
            .into_iter()
            .map(|m| m.index_of_patterns().unwrap())
            .collect::<Vec<_>>();

        assert_eq!(pattern_indices, vec![0, 2]);
    }

    /// Matches that touch both ends of the text are still reported.
    #[test]
    fn test_chars_on_edge() {
        let edge_text = "你好世界";
        let dictionary = BackwardDictionary::new(
            vec!["你好", "世界"]
        ).unwrap();

        let matches = segment_backward_longest(
            edge_text,
            &dictionary,
            BehaviorForUnmatched::Ignore,
        );
        let segments = matches
            .into_iter()
            .map(|m| m.range().extract(edge_text).unwrap())
            .collect::<Vec<_>>();

        assert_eq!(segments, vec!["你好", "世界"]);
    }
}