ultra_nlp/_daachorse/
segment_backward_longest.rs

1use crate::{
2    Match,
3    TextRange,
4    BehaviorForUnmatched,
5    utils::split_as_char_ranges,
6};
7use crate::daachorse::BackwardDictionary;
8
9// 待generator稳定, 改为generator, 以便返回Iterator.
10pub fn segment_backward_longest<T: AsRef<str>>(
11    text: T,
12    dict: &BackwardDictionary,
13    behavior_for_unmatched: BehaviorForUnmatched,
14) -> Vec<Match> {
15    let text = text
16        .as_ref()
17        .to_lowercase()
18        .chars()
19        .rev()
20        .collect::<String>();
21
22    let mut results: Vec<Match> = vec![];
23
24    let mut start_index = 0;
25    while start_index < text.len() {
26        if text.is_char_boundary(start_index) {
27            let mut iter = dict.acdat.leftmost_find_iter(&text[start_index..]);
28
29            match iter.next() {
30                Some(mat) => {
31                    let real_mat_start_index = start_index + mat.start();
32                    let real_mat_end_index = start_index + mat.end();
33                    let result = Match::new(
34                        TextRange::new(
35                            text.len() - real_mat_end_index,
36                            text.len() - real_mat_start_index,
37                        ),
38                        Some(mat.value())
39                    );
40
41                    if mat.start() > 0 {
42                        // 处理匹配结果之前的文本
43                        match behavior_for_unmatched {
44                            BehaviorForUnmatched::Ignore => {},
45                            BehaviorForUnmatched::KeepAsWords => {
46                                let result = Match::new(
47                                    TextRange::new(
48                                        text.len() - (start_index + mat.start()),
49                                        text.len() - start_index,
50                                    ),
51                                    None,
52                                );
53                                results.push(result);
54                            },
55                            BehaviorForUnmatched::KeepAsChars => {
56                                let iter = split_as_char_ranges(&text[start_index..start_index + mat.start()])
57                                    .map(|range| {
58                                        Match::new(
59                                            TextRange::new(
60                                                text.len() - (start_index + range.end_index()),
61                                                text.len() - (start_index + range.start_index()),
62                                            ),
63                                            None,
64                                        )
65                                    });
66
67                                results.extend(iter);
68                            },
69                        }
70                    }
71
72                    start_index = real_mat_end_index;
73
74                    results.push(result);
75                }
76                None => {
77                    // 处理text剩余的文本
78                    match behavior_for_unmatched {
79                        BehaviorForUnmatched::Ignore => {},
80                        BehaviorForUnmatched::KeepAsWords => {
81                            results.push(
82                                Match::new(
83                                    TextRange::new(
84                                        0,
85                                        text.len() - start_index
86                                    ),
87                                    None,
88                                )
89                            );
90                        },
91                        BehaviorForUnmatched::KeepAsChars => {
92                            let iter = split_as_char_ranges(&text[start_index..])
93                                .map(|range| {
94                                    Match::new(
95                                        TextRange::new(
96                                            text.len() - (start_index + range.end_index()),
97                                            text.len() - (start_index + range.start_index()),
98                                        ),
99                                        None,
100                                    )
101                                });
102
103                            results.extend(iter);
104                        },
105                    }
106
107                    start_index += 1;
108                }
109            }
110        } else {
111            start_index += 1;
112        }
113    }
114
115    results.reverse();
116
117    results
118}
119
#[cfg(test)]
mod tests {
    use crate::BehaviorForUnmatched;
    use crate::daachorse::{
        segment_backward_longest,
        BackwardDictionary,
    };

    /// Mixed Chinese/ASCII sentence with leading and trailing spaces, shared
    /// by most of the tests below.
    const SAMPLE_TEXT: &str = " 商品和服务, hello world ";

    /// Builds the dictionary shared by most of the tests below.
    fn sample_dict() -> BackwardDictionary {
        BackwardDictionary::new(
            vec!["商品", "和服", "服务", "你好世界"]
        ).unwrap()
    }

    /// With `Ignore`, only dictionary hits are returned.
    #[test]
    fn test_ignore_unmatched() {
        let dict = sample_dict();

        let segments: Vec<_> = segment_backward_longest(
            SAMPLE_TEXT,
            &dict,
            BehaviorForUnmatched::Ignore,
        )
            .into_iter()
            .map(|mat| mat.range().extract(SAMPLE_TEXT).unwrap())
            .collect();

        assert_eq!(segments, vec!["商品", "服务",]);
    }

    /// With `KeepAsChars`, every unmatched character is its own segment.
    #[test]
    fn test_keep_unmatched_as_chars() {
        let dict = sample_dict();

        let segments: Vec<_> = segment_backward_longest(
            SAMPLE_TEXT,
            &dict,
            BehaviorForUnmatched::KeepAsChars,
        )
            .into_iter()
            .map(|mat| mat.range().extract(SAMPLE_TEXT).unwrap())
            .collect();

        let expected = vec![
            " ",
            "商品",
            "和",
            "服务",
            ",",
            " ",
            "h",
            "e",
            "l",
            "l",
            "o",
            " ",
            "w",
            "o",
            "r",
            "l",
            "d",
            " ",
        ];
        assert_eq!(segments, expected);
    }

    /// With `KeepAsWords`, each unmatched run is kept as a single segment.
    #[test]
    fn test_keep_unmatched_as_words() {
        let dict = sample_dict();

        let segments: Vec<_> = segment_backward_longest(
            SAMPLE_TEXT,
            &dict,
            BehaviorForUnmatched::KeepAsWords,
        )
            .into_iter()
            .map(|mat| mat.range().extract(SAMPLE_TEXT).unwrap())
            .collect();

        let expected = vec![
            " ",
            "商品",
            "和",
            "服务",
            ", hello world ",
        ];
        assert_eq!(segments, expected);
    }

    /// Each hit reports the index of the pattern it came from.
    #[test]
    fn test_value() {
        let dict = sample_dict();

        let pattern_indices: Vec<_> = segment_backward_longest(
            SAMPLE_TEXT,
            &dict,
            BehaviorForUnmatched::Ignore,
        )
            .into_iter()
            .map(|mat| mat.index_of_patterns().unwrap())
            .collect();

        assert_eq!(pattern_indices, vec![0, 2]);
    }

    /// Hits that touch both edges of the text are still found.
    #[test]
    fn test_chars_on_edge() {
        let text = "你好世界";
        let dict = BackwardDictionary::new(
            vec!["你好", "世界"]
        ).unwrap();

        let segments: Vec<_> = segment_backward_longest(
            text,
            &dict,
            BehaviorForUnmatched::Ignore,
        )
            .into_iter()
            .map(|mat| mat.range().extract(text).unwrap())
            .collect();

        assert_eq!(segments, vec!["你好", "世界"]);
    }
}