ultra_nlp/_daachorse/
segment_forward_longest.rs

1use crate::{
2    Match,
3    TextRange,
4    BehaviorForUnmatched,
5    utils::split_as_char_ranges,
6};
7use crate::daachorse::ForwardDictionary;
8
9// 待generator稳定, 改为generator, 以便返回Iterator.
10pub fn segment_forward_longest<T: AsRef<str>>(
11    text: T,
12    dict: &ForwardDictionary,
13    behavior_for_unmatched: BehaviorForUnmatched,
14) -> Vec<Match> {
15    let text = text.as_ref().to_lowercase();
16    let mut results: Vec<Match> = vec![];
17
18    let mut start_index = 0;
19    while start_index < text.len() {
20        if text.is_char_boundary(start_index) {
21            let mut iter = dict.acdat.leftmost_find_iter(&text[start_index..]);
22
23            match iter.next() {
24                Some(mat) => {
25                    let real_mat_start_index = start_index + mat.start();
26                    let real_mat_end_index = start_index + mat.end();
27                    let result = Match::new(
28                        TextRange::new(real_mat_start_index, real_mat_end_index),
29                        Some(mat.value())
30                    );
31
32                    if mat.start() > 0 {
33                        // 处理匹配结果之前的文本
34                        match behavior_for_unmatched {
35                            BehaviorForUnmatched::Ignore => {},
36                            BehaviorForUnmatched::KeepAsWords => {
37                                results.push(
38                                    Match::new(
39                                        TextRange::new(
40                                            start_index, start_index + mat.start()
41                                        ),
42                                        None
43                                    )
44                                );
45                            },
46                            BehaviorForUnmatched::KeepAsChars => {
47                                let iter = split_as_char_ranges(
48                                    &text[start_index..start_index + mat.start()]
49                                )
50                                    .map(|range| {
51                                        Match::new(
52                                            TextRange::new(
53                                                start_index + range.start_index(),
54                                                start_index + range.end_index(),
55                                            ),
56                                            None,
57                                        )
58                                    });
59
60                                results.extend(iter);
61                            },
62                        }
63                    }
64
65                    start_index = real_mat_end_index;
66
67                    results.push(result);
68                }
69                None => {
70                    // 处理text剩余的文本
71                    match behavior_for_unmatched {
72                        BehaviorForUnmatched::Ignore => {},
73                        BehaviorForUnmatched::KeepAsWords => {
74                            results.push(
75                                Match::new(
76                                    TextRange::new(
77                                        start_index,
78                                        text.len()
79                                    ),
80                                    None,
81                                )
82                            );
83                        },
84                        BehaviorForUnmatched::KeepAsChars => {
85                            let iter = split_as_char_ranges(&text[start_index..])
86                                .map(|range| {
87                                    Match::new(
88                                        TextRange::new(
89                                            start_index + range.start_index(),
90                                            start_index + range.end_index(),
91                                        ),
92                                        None,
93                                    )
94                                });
95
96                            results.extend(iter);
97                        },
98                    }
99
100                    break;
101                }
102            }
103        } else {
104            start_index += 1;
105        }
106    }
107
108    results
109}
110
#[cfg(test)]
mod tests {
    use crate::BehaviorForUnmatched;
    use crate::daachorse::{
        segment_forward_longest,
        ForwardDictionary,
    };

    /// Text shared by the tests that exercise unmatched-text handling.
    const SAMPLE_TEXT: &str = " 商品和服务, hello world ";

    /// Builds the dictionary shared by the tests that use `SAMPLE_TEXT`.
    fn sample_dict() -> ForwardDictionary {
        ForwardDictionary::new(
            vec!["商品", "和服", "服务", "你好世界"]
        ).unwrap()
    }

    // Unmatched text is dropped; only the dictionary matches remain.
    #[test]
    fn test_ignore_unmatched() {
        let dict = sample_dict();

        let matches = segment_forward_longest(
            SAMPLE_TEXT,
            &dict,
            BehaviorForUnmatched::Ignore,
        );
        let segments: Vec<_> = matches
            .into_iter()
            .map(|m| m.range().extract(SAMPLE_TEXT).unwrap())
            .collect();

        assert_eq!(segments, vec!["商品", "和服"]);
    }

    // Unmatched text is emitted one piece per character range.
    #[test]
    fn test_keep_unmatched_as_chars() {
        let dict = sample_dict();

        let matches = segment_forward_longest(
            SAMPLE_TEXT,
            &dict,
            BehaviorForUnmatched::KeepAsChars,
        );
        let segments: Vec<_> = matches
            .into_iter()
            .map(|m| m.range().extract(SAMPLE_TEXT).unwrap())
            .collect();

        assert_eq!(
            segments,
            vec![
                " ", "商品", "和服", "务", ",", " ",
                "h", "e", "l", "l", "o", " ",
                "w", "o", "r", "l", "d", " ",
            ],
        );
    }

    // Each unmatched run is emitted as a single piece.
    #[test]
    fn test_keep_unmatched_as_words() {
        let dict = sample_dict();

        let matches = segment_forward_longest(
            SAMPLE_TEXT,
            &dict,
            BehaviorForUnmatched::KeepAsWords,
        );
        let segments: Vec<_> = matches
            .into_iter()
            .map(|m| m.range().extract(SAMPLE_TEXT).unwrap())
            .collect();

        assert_eq!(
            segments,
            vec![" ", "商品", "和服", "务, hello world "],
        );
    }

    // Dictionary matches expose the value obtained via `index_of_patterns`.
    #[test]
    fn test_value() {
        let dict = sample_dict();

        let matches = segment_forward_longest(
            SAMPLE_TEXT,
            &dict,
            BehaviorForUnmatched::Ignore,
        );
        let values: Vec<_> = matches
            .into_iter()
            .map(|m| m.index_of_patterns().unwrap())
            .collect();

        assert_eq!(values, vec![0, 1]);
    }

    // Matches that touch both edges of the text are reported in full.
    #[test]
    fn test_chars_on_edge() {
        let text = "你好世界";
        let dict = ForwardDictionary::new(vec!["你好", "世界"]).unwrap();

        let matches = segment_forward_longest(
            text,
            &dict,
            BehaviorForUnmatched::Ignore,
        );
        let segments: Vec<_> = matches
            .into_iter()
            .map(|m| m.range().extract(text).unwrap())
            .collect();

        assert_eq!(segments, vec!["你好", "世界"]);
    }
}