ultra_nlp/_cedarwood/
segment_forward_longest.rs

1use crate::utils::split_as_char_ranges;
2use crate::{
3    Match,
4    TextRange,
5    BehaviorForUnmatched,
6    UltraNLPError,
7};
8use crate::cedarwood::ForwardDictionary;
9
10// 待generator稳定, 改为generator, 以便返回Iterator.
11pub fn segment_forward_longest<T: AsRef<str>>(
12    text: T,
13    dict: &ForwardDictionary,
14    behavior_for_unmatched: BehaviorForUnmatched,
15) -> Vec<Match> {
16    let text = text
17        .as_ref()
18        .to_lowercase();
19
20    let mut results: Vec<Match> = vec![];
21
22    let mut unconsumed_start_index: Option<usize> = None;
23    let mut maximum_matched_end_index = 0;
24    let mut start_index = 0;
25    while start_index < text.len() {
26        if text.is_char_boundary(start_index) {
27            let mut next_start_index = start_index + 1;
28
29            let mut matched_results: Vec<Match> = vec![];
30            // 注意, 虽然不知道这个Option的意义, 但Option是Some不代表matches非空.
31            if let Some(matches) = dict.dat.common_prefix_search(&text[start_index..]) {
32                let longest_match: Option<(i32, usize)> = matches
33                    .into_iter()
34                    .reduce(| longest, current | {
35                        let (_, longest_length) = longest;
36                        let (_, current_length) = current;
37                        if current_length > longest_length {
38                            current
39                        } else {
40                            longest
41                        }
42                    });
43
44                if let Some((id, length)) = longest_match{
45                    let range = TextRange::new(
46                        start_index,
47                        start_index + length + 1
48                    );
49                    let value = usize::try_from(id)
50                        .map_err(|err| UltraNLPError::new(err.to_string()))
51                        // 没有使用负数值, 且usize的最大值大于i32, 转换应当总是能成功
52                        .unwrap();
53
54                    let result = Match::new(range, Some(value));
55                    matched_results.push(result);
56
57                    next_start_index = range.end_index();
58                    maximum_matched_end_index = range.end_index();
59                }
60            }
61
62            let mut unmatched_results: Vec<Match> = {
63                let mut unmatched_results = vec![];
64
65                match behavior_for_unmatched {
66                    BehaviorForUnmatched::KeepAsWords => {
67                        if matched_results.len() > 0 {
68                            // 将之前未消耗的word作为Match提交
69                            if let Some(index) = unconsumed_start_index {
70                                let result = Match::new(
71                                    TextRange::new(index, start_index),
72                                    None,
73                                );
74                                unmatched_results.push(result);
75                                unconsumed_start_index = None;
76                            }
77                        } else {
78                            if start_index >= maximum_matched_end_index {
79                                if let None = unconsumed_start_index {
80                                    unconsumed_start_index = Some(start_index);
81                                }
82                            }
83                        }
84                    },
85                    BehaviorForUnmatched::KeepAsChars => {
86                        if matched_results.len() > 0 {
87                            // 将之前未消耗的char作为Match提交
88                            if let Some(index) = unconsumed_start_index {
89                                let result = Match::new(
90                                    TextRange::new(index, start_index),
91                                    None,
92                                );
93                                unmatched_results.push(result);
94                                unconsumed_start_index = None;
95                            }
96                        } else {
97                            if start_index >= maximum_matched_end_index {
98                                if let None = unconsumed_start_index {
99                                    unconsumed_start_index = Some(start_index);
100                                }
101                            }
102                        }
103                    },
104                    BehaviorForUnmatched::Ignore => (),
105                }
106
107                unmatched_results
108            };
109
110            results.append(&mut unmatched_results);
111            results.append(&mut matched_results);
112
113            start_index = next_start_index;
114        } else {
115            start_index += 1;
116        }
117    }
118    if maximum_matched_end_index < text.len() {
119        // 处理text剩余的文本
120        match behavior_for_unmatched {
121            BehaviorForUnmatched::KeepAsWords => {
122                results.push(Match::new(
123                    TextRange::new(maximum_matched_end_index, text.len()),
124                    None
125                ))
126            },
127            BehaviorForUnmatched::KeepAsChars => {
128                let iter = split_as_char_ranges(&text[maximum_matched_end_index..])
129                    .map(|range| {
130                        Match::new(
131                            TextRange::new(
132                                maximum_matched_end_index + range.start_index(),
133                                maximum_matched_end_index + range.end_index(),
134                            ),
135                            None,
136                        )
137                    });
138
139                results.extend(iter);
140            }
141            BehaviorForUnmatched::Ignore => (),
142        }
143    }
144
145    results
146}
147
#[cfg(test)]
mod tests {
    use crate::BehaviorForUnmatched;
    use crate::cedarwood::{
        segment_forward_longest,
        ForwardDictionary,
    };

    /// Mixed Chinese/English sample shared by most of the tests below.
    const SAMPLE_TEXT: &str = " 商品和服务, hello world ";

    /// Builds the dictionary that is paired with `SAMPLE_TEXT`.
    fn sample_dict() -> ForwardDictionary {
        ForwardDictionary::new(
            vec!["商品", "和服", "服务", "你好世界"]
        ).unwrap()
    }

    // Only dictionary hits are returned when unmatched text is ignored.
    #[test]
    fn test_ignore_unmatched() {
        let dict = sample_dict();

        let matches = segment_forward_longest(
            SAMPLE_TEXT,
            &dict,
            BehaviorForUnmatched::Ignore
        );
        let segments: Vec<_> = matches
            .into_iter()
            .map(|m| m.range().extract(SAMPLE_TEXT).unwrap())
            .collect();

        assert_eq!(segments, vec!["商品", "和服",]);
    }

    // Unmatched text comes back one character at a time.
    #[test]
    fn test_keep_unmatched_as_chars() {
        let dict = sample_dict();

        let matches = segment_forward_longest(
            SAMPLE_TEXT,
            &dict,
            BehaviorForUnmatched::KeepAsChars
        );
        let segments: Vec<_> = matches
            .into_iter()
            .map(|m| m.range().extract(SAMPLE_TEXT).unwrap())
            .collect();

        assert_eq!(
            segments,
            vec![
                " ",
                "商品",
                "和服",
                "务",
                ",",
                " ",
                "h",
                "e",
                "l",
                "l",
                "o",
                " ",
                "w",
                "o",
                "r",
                "l",
                "d",
                " ",
            ],
        );
    }

    // Unmatched text comes back as whole runs.
    #[test]
    fn test_keep_unmatched_as_words() {
        let dict = sample_dict();

        let matches = segment_forward_longest(
            SAMPLE_TEXT,
            &dict,
            BehaviorForUnmatched::KeepAsWords
        );
        let segments: Vec<_> = matches
            .into_iter()
            .map(|m| m.range().extract(SAMPLE_TEXT).unwrap())
            .collect();

        assert_eq!(
            segments,
            vec![
                " ",
                "商品",
                "和服",
                "务, hello world ",
            ],
        );
    }

    // Each hit reports the index of the pattern it came from.
    #[test]
    fn test_value() {
        let dict = sample_dict();

        let matches = segment_forward_longest(
            SAMPLE_TEXT,
            &dict,
            BehaviorForUnmatched::Ignore
        );
        let pattern_indexes: Vec<_> = matches
            .into_iter()
            .map(|m| m.index_of_patterns().unwrap())
            .collect();

        assert_eq!(pattern_indexes, vec![0, 1]);
    }

    // Matches touching both ends of the text are found.
    #[test]
    fn test_chars_on_edge() {
        let text = "你好世界";
        let dict = ForwardDictionary::new(
            vec!["你好", "世界"]
        ).unwrap();

        let matches = segment_forward_longest(
            text,
            &dict,
            BehaviorForUnmatched::Ignore
        );
        let segments: Vec<_> = matches
            .into_iter()
            .map(|m| m.range().extract(text).unwrap())
            .collect();

        assert_eq!(segments, vec!["你好", "世界"]);
    }
}