ultra_nlp/_hashmap/
segment_backward_longest.rs

1use crate::utils::split_as_char_ranges;
2use crate::{
3    Match,
4    TextRange,
5    BehaviorForUnmatched,
6};
7use crate::hashmap::Dictionary;
8
9// 待generator稳定, 改为generator, 以便返回Iterator.
10pub fn segment_backward_longest<T: AsRef<str>>(
11    text: T,
12    dict: &Dictionary,
13    behavior_for_unmatched: BehaviorForUnmatched,
14) -> Vec<Match> {
15    let text = text
16        .as_ref()
17        .to_lowercase();
18
19    let mut results: Vec<Match> = vec![];
20
21    let mut unconsumed_end_index: Option<usize> = None;
22    let mut minimum_matched_start_index = text.len();
23    let mut end_index = text.len();
24    while end_index > 0 {
25        if text.is_char_boundary(end_index) {
26            let mut next_end_index = end_index - 1;
27
28            let mut matched_results: Vec<Match> = vec![];
29            let mut longest_match: Option<(
30                usize, // start_index
31                usize, // value
32            )> = None;
33            (0..end_index)
34                .rev()
35                .into_iter()
36                .for_each(|start_index| {
37                    if text.is_char_boundary(start_index) {
38                        let sub_text = &text[start_index..end_index];
39
40                        if let Some(value) = dict.map.get(sub_text) {
41                            longest_match = Some((start_index, *value))
42                        }
43                    }
44                });
45
46            if let Some((start_index, value)) = longest_match {
47                let range = TextRange::new(
48                    start_index,
49                    end_index,
50                );
51
52                let result = Match::new(range, Some(value));
53                matched_results.push(result);
54
55                next_end_index = start_index;
56                minimum_matched_start_index = start_index;
57            }
58
59            let mut unmatched_results: Vec<Match> = {
60                let mut unmatched_results: Vec<Match> = vec![];
61
62                match behavior_for_unmatched {
63                    BehaviorForUnmatched::KeepAsWords => {
64                        if matched_results.len() > 0 {
65                            // 将之前未消耗的word作为Match提交
66                            if let Some(index) = unconsumed_end_index {
67                                let result = Match::new(
68                                    TextRange::new(
69                                        end_index,
70                                        index,
71                                    ),
72                                    None,
73                                );
74                                unmatched_results.push(result);
75                                unconsumed_end_index = None;
76                            }
77                        } else {
78                            if end_index <= minimum_matched_start_index {
79                                if let None = unconsumed_end_index {
80                                    unconsumed_end_index = Some(end_index);
81                                }
82                            }
83                        }
84                    },
85                    BehaviorForUnmatched::KeepAsChars => {
86                        if matched_results.len() > 0 {
87                            // 将之前未消耗的char作为Match提交
88                            if let Some(index) = unconsumed_end_index {
89                                let iter = split_as_char_ranges(&text[end_index..index])
90                                    .map(|range| {
91                                        Match::new(
92                                            TextRange::new(
93                                                end_index + range.start_index(),
94                                                end_index + range.end_index(),
95                                            ),
96                                            None,
97                                        )
98                                    });
99
100                                unmatched_results.extend(iter);
101                                unmatched_results.reverse();
102                                unconsumed_end_index = None;
103                            }
104                        } else {
105                            if end_index >= minimum_matched_start_index {
106                                if let None = unconsumed_end_index {
107                                    unconsumed_end_index = Some(end_index);
108                                }
109                            }
110                        }
111                    },
112                    BehaviorForUnmatched::Ignore => (),
113                }
114
115                unmatched_results
116            };
117
118            results.append(&mut unmatched_results);
119            results.append(&mut matched_results);
120
121            end_index = next_end_index;
122        } else {
123            end_index -= 1;
124        }
125    }
126    if minimum_matched_start_index > 0 {
127        // 处理text剩余的文本
128        match behavior_for_unmatched {
129            BehaviorForUnmatched::KeepAsWords => {
130                results.push(Match::new(
131                    TextRange::new(
132                        0,
133                        minimum_matched_start_index,
134                    ),
135                    None
136                ));
137            },
138            BehaviorForUnmatched::KeepAsChars => {
139                let iter = split_as_char_ranges(&text[0..minimum_matched_start_index])
140                    .map(|range| {
141                        Match::new(
142                            TextRange::new(
143                                range.start_index(),
144                                range.end_index(),
145                            ),
146                            None
147                        )
148                    });
149
150                results.extend(iter);
151            }
152            BehaviorForUnmatched::Ignore => (),
153        }
154    }
155
156    results.reverse();
157
158    results
159}
160
161#[cfg(test)]
162mod tests {
163    use crate::BehaviorForUnmatched;
164    use crate::hashmap::{
165        segment_backward_longest,
166        Dictionary,
167    };
168
169    #[test]
170    fn test_ignore_unmatched() {
171        let text = " 商品和服务, hello world ";
172        let dict = Dictionary::new(
173            vec!["商品", "和服", "服务", "你好世界"]
174        ).unwrap();
175
176        let result = segment_backward_longest(
177            text,
178            &dict,
179            BehaviorForUnmatched::Ignore
180        );
181
182        assert_eq!(
183            result
184                .iter()
185                .map(|x| x.range().extract(text).unwrap())
186                .collect::<Vec<_>>(),
187            vec!["商品", "服务",]
188        );
189    }
190
191    #[test]
192    fn test_keep_unmatched_as_chars() {
193        let text = " 商品和服务, hello world ";
194        let dict = Dictionary::new(
195            vec!["商品", "和服", "服务", "你好世界"]
196        ).unwrap();
197
198        let result = segment_backward_longest(
199            text,
200            &dict,
201            BehaviorForUnmatched::KeepAsChars
202        );
203
204        assert_eq!(
205            result
206                .into_iter()
207                .map(|x| x.range().extract(text).unwrap())
208                .collect::<Vec<_>>(),
209            vec![
210                " ",
211                "商品",
212                "和",
213                "服务",
214                ",",
215                " ",
216                "h",
217                "e",
218                "l",
219                "l",
220                "o",
221                " ",
222                "w",
223                "o",
224                "r",
225                "l",
226                "d",
227                " ",
228            ]
229        );
230    }
231
232    #[test]
233    fn test_keep_unmatched_as_words() {
234        let text = " 商品和服务, hello world ";
235        let dict = Dictionary::new(
236            vec!["商品", "和服", "服务", "你好世界"]
237        ).unwrap();
238
239        let result = segment_backward_longest(
240            text,
241            &dict,
242            BehaviorForUnmatched::KeepAsWords
243        );
244
245        assert_eq!(
246            result
247                .into_iter()
248                .map(|x| x.range().extract(text).unwrap())
249                .collect::<Vec<_>>(),
250            vec![
251                " ",
252                "商品",
253                "和",
254                "服务",
255                ", hello world ",
256            ]
257        );
258    }
259
260    #[test]
261    fn test_value() {
262        let text = " 商品和服务, hello world ";
263        let dict = Dictionary::new(
264            vec![
265                "商品",
266                "和服",
267                "服务",
268                "你好世界",
269            ]
270        ).unwrap();
271
272        let result = segment_backward_longest(
273            text,
274            &dict,
275            BehaviorForUnmatched::Ignore
276        );
277
278        assert_eq!(
279            result
280                .into_iter()
281                .map(|x| x.index_of_patterns().unwrap())
282                .collect::<Vec<_>>(),
283            vec![0, 2]
284        );
285    }
286
287    #[test]
288    fn test_chars_on_edge() {
289        let text = "你好世界";
290        let dict = Dictionary::new(
291            vec!["你好", "世界"]
292        ).unwrap();
293
294        let result = segment_backward_longest(
295            text,
296            &dict,
297            BehaviorForUnmatched::Ignore
298        );
299
300        assert_eq!(
301            result
302                .into_iter()
303                .map(|x| x.range().extract(text).unwrap())
304                .collect::<Vec<_>>(),
305            vec!["你好", "世界"]
306        );
307    }
308}