ultra_nlp/_cedarwood/
segment_backward_longest.rs

1use crate::utils::split_as_char_ranges;
2use crate::{
3    Match,
4    TextRange,
5    BehaviorForUnmatched,
6    UltraNLPError,
7};
8use crate::cedarwood::BackwardDictionary;
9
10// 待generator稳定, 改为generator, 以便返回Iterator.
11pub fn segment_backward_longest<T: AsRef<str>>(
12    text: T,
13    dict: &BackwardDictionary,
14    behavior_for_unmatched: BehaviorForUnmatched,
15) -> Vec<Match> {
16    let text = text
17        .as_ref()
18        .to_lowercase()
19        .chars()
20        .rev()
21        .collect::<String>();
22
23    let mut results: Vec<Match> = vec![];
24
25    let mut unconsumed_start_index: Option<usize> = None;
26    let mut maximum_matched_end_index = 0;
27    let mut start_index = 0;
28    while start_index < text.len() {
29        if text.is_char_boundary(start_index) {
30            let mut next_start_index = start_index + 1;
31
32            let mut matched_results: Vec<Match> = vec![];
33            // 注意, 虽然不知道这个Option的意义, 但Option是Some不代表matches非空.
34            if let Some(matches) = dict.dat.common_prefix_search(&text[start_index..]) {
35                let longest_match: Option<(i32, usize)> = matches
36                    .into_iter()
37                    .reduce(| longest, current | {
38                        let (_, longest_length) = longest;
39                        let (_, current_length) = current;
40                        if current_length > longest_length {
41                            current
42                        } else {
43                            longest
44                        }
45                    });
46
47                if let Some((id, length)) = longest_match {
48                    let end_index = start_index + length + 1;
49                    let range = TextRange::new(
50                        text.len() - end_index,
51                        text.len() - start_index,
52                    );
53                    let value = usize::try_from(id)
54                        .map_err(|err| UltraNLPError::new(err.to_string()))
55                        // 没有使用负数值, 且usize的最大值大于i32, 转换应当总是能成功
56                        .unwrap();
57
58                    let result = Match::new(range, Some(value));
59                    matched_results.push(result);
60
61                    next_start_index = start_index + length + 1;
62                    maximum_matched_end_index = start_index + length + 1;
63                }
64            }
65
66            let mut unmatched_results: Vec<Match> = {
67                let mut unmatched_results: Vec<Match> = vec![];
68
69                match behavior_for_unmatched {
70                    BehaviorForUnmatched::KeepAsWords => {
71                        if matched_results.len() > 0 {
72                            // 将之前未消耗的word作为Match提交
73                            if let Some(index) = unconsumed_start_index {
74                                let result = Match::new(
75                                    TextRange::new(
76                                        text.len() - start_index,
77                                        text.len() - index,
78                                    ),
79                                    None,
80                                );
81                                unmatched_results.push(result);
82                                unconsumed_start_index = None;
83                            }
84                        } else {
85                            if start_index >= maximum_matched_end_index {
86                                if let None = unconsumed_start_index {
87                                    unconsumed_start_index = Some(start_index);
88                                }
89                            }
90                        }
91                    },
92                    BehaviorForUnmatched::KeepAsChars => {
93                        if matched_results.len() > 0 {
94                            // 将之前未消耗的char作为Match提交
95                            if let Some(index) = unconsumed_start_index {
96                                let iter = split_as_char_ranges(&text[index..start_index])
97                                    .map(|range| {
98                                        Match::new(
99                                            TextRange::new(
100                                                text.len() - (index + range.end_index()),
101                                                text.len() - (index + range.start_index()),
102                                            ),
103                                            None,
104                                        )
105                                    });
106
107                                unmatched_results.extend(iter);
108                                unconsumed_start_index = None;
109                            }
110                        } else {
111                            if start_index >= maximum_matched_end_index {
112                                if let None = unconsumed_start_index {
113                                    unconsumed_start_index = Some(start_index);
114                                }
115                            }
116                        }
117                    },
118                    BehaviorForUnmatched::Ignore => (),
119                }
120
121                unmatched_results
122            };
123
124            results.append(&mut unmatched_results);
125            results.append(&mut matched_results);
126
127            start_index = next_start_index;
128        } else {
129            start_index += 1;
130        }
131    }
132    if maximum_matched_end_index < text.len() {
133        // 处理text剩余的文本
134        match behavior_for_unmatched {
135            BehaviorForUnmatched::KeepAsWords => {
136                results.push(Match::new(
137                    TextRange::new(
138                        0,
139                        text.len() - maximum_matched_end_index,
140                    ),
141                    None
142                ))
143            },
144            BehaviorForUnmatched::KeepAsChars => {
145                let iter = split_as_char_ranges(&text[maximum_matched_end_index..])
146                    .map(|range| {
147                        Match::new(
148                            TextRange::new(
149                                text.len() - (maximum_matched_end_index + range.end_index()),
150                                text.len() - (maximum_matched_end_index + range.start_index()),
151                            ),
152                            None
153                        )
154                    });
155
156                results.extend(iter);
157            }
158            BehaviorForUnmatched::Ignore => (),
159        }
160    }
161
162    results.reverse();
163
164    results
165}
166
#[cfg(test)]
mod tests {
    use crate::BehaviorForUnmatched;
    use crate::cedarwood::{
        segment_backward_longest,
        BackwardDictionary,
    };

    /// Sentence shared by most tests: CJK and ASCII words with a space on
    /// both edges.
    const SAMPLE_TEXT: &str = " 商品和服务, hello world ";

    /// Builds the dictionary used together with `SAMPLE_TEXT`.
    fn sample_dict() -> BackwardDictionary {
        BackwardDictionary::new(
            vec!["商品", "和服", "服务", "你好世界"]
        ).unwrap()
    }

    // Unmatched text is dropped; only dictionary words are returned.
    #[test]
    fn test_ignore_unmatched() {
        let dict = sample_dict();

        let matches = segment_backward_longest(
            SAMPLE_TEXT,
            &dict,
            BehaviorForUnmatched::Ignore
        );
        let words = matches
            .into_iter()
            .map(|x| x.range().extract(SAMPLE_TEXT).unwrap())
            .collect::<Vec<_>>();

        assert_eq!(words, vec!["商品", "服务",]);
    }

    // Unmatched text is returned one character at a time.
    #[test]
    fn test_keep_unmatched_as_chars() {
        let dict = sample_dict();

        let matches = segment_backward_longest(
            SAMPLE_TEXT,
            &dict,
            BehaviorForUnmatched::KeepAsChars
        );
        let words = matches
            .into_iter()
            .map(|x| x.range().extract(SAMPLE_TEXT).unwrap())
            .collect::<Vec<_>>();

        assert_eq!(
            words,
            vec![
                " ",
                "商品",
                "和",
                "服务",
                ",",
                " ",
                "h",
                "e",
                "l",
                "l",
                "o",
                " ",
                "w",
                "o",
                "r",
                "l",
                "d",
                " ",
            ]
        );
    }

    // Each contiguous unmatched stretch is returned as a single piece.
    #[test]
    fn test_keep_unmatched_as_words() {
        let dict = sample_dict();

        let matches = segment_backward_longest(
            SAMPLE_TEXT,
            &dict,
            BehaviorForUnmatched::KeepAsWords
        );
        let words = matches
            .into_iter()
            .map(|x| x.range().extract(SAMPLE_TEXT).unwrap())
            .collect::<Vec<_>>();

        assert_eq!(
            words,
            vec![
                " ",
                "商品",
                "和",
                "服务",
                ", hello world ",
            ]
        );
    }

    // Each match reports the index of the pattern it came from.
    #[test]
    fn test_value() {
        let dict = sample_dict();

        let matches = segment_backward_longest(
            SAMPLE_TEXT,
            &dict,
            BehaviorForUnmatched::Ignore
        );
        let pattern_indexes = matches
            .into_iter()
            .map(|x| x.index_of_patterns().unwrap())
            .collect::<Vec<_>>();

        assert_eq!(pattern_indexes, vec![0, 2]);
    }

    // Words touching the very first and very last byte are still found.
    #[test]
    fn test_chars_on_edge() {
        let text = "你好世界";
        let dict = BackwardDictionary::new(
            vec!["你好", "世界"]
        ).unwrap();

        let matches = segment_backward_longest(
            text,
            &dict,
            BehaviorForUnmatched::Ignore
        );
        let words = matches
            .into_iter()
            .map(|x| x.range().extract(text).unwrap())
            .collect::<Vec<_>>();

        assert_eq!(words, vec!["你好", "世界"]);
    }
}