ultra_nlp/_cedarwood/
segment_fully.rs

1use crate::utils::split_as_char_ranges;
2use crate::{
3    Match,
4    TextRange,
5    BehaviorForUnmatched, UltraNLPError,
6};
7use crate::cedarwood::ForwardDictionary;
8
9// 待generator稳定, 改为generator, 以便返回Iterator.
10pub fn segment_fully<T: AsRef<str>>(
11    text: T,
12    dict: &ForwardDictionary,
13    behavior_for_unmatched: BehaviorForUnmatched,
14) -> Vec<Match> {
15    let text = text.as_ref().to_lowercase();
16    let mut results: Vec<Match> = vec![];
17
18    let mut unconsumed_word_start_index: Option<usize> = None;
19    let mut unconsumed_char_start_index: Option<usize> = None;
20    let mut maximum_matched_end_index = 0;
21    (0..text.len()).for_each(|start_index| {
22        if text.is_char_boundary(start_index) {
23            let mut matched_results: Vec<Match> = vec![];
24            // 注意, 虽然不知道这个Option的意义, 但Option是Some不代表matches非空.
25            if let Some(matches) = dict.dat.common_prefix_search(&text[start_index..]) {
26                matches
27                    .into_iter()
28                    .for_each(|(id, length)| {
29                        let range = TextRange::new(
30                            start_index,
31                            start_index + length + 1
32                        );
33                        let value = usize::try_from(id)
34                            .map_err(|err| UltraNLPError::new(err.to_string()))
35                            // 没有使用负数值, 且usize的最大值大于i32, 转换应当总是能成功
36                            .unwrap();
37
38                        let result = Match::new(range, Some(value));
39                        matched_results.push(result);
40
41                        if range.end_index() > maximum_matched_end_index {
42                            maximum_matched_end_index = range.end_index();
43                        }
44                    });
45            }
46
47            let mut unmatched_results: Vec<Match> = {
48                let mut unmatched_results = vec![];
49
50                match behavior_for_unmatched {
51                    BehaviorForUnmatched::KeepAsWords => {
52                        if matched_results.len() > 0 {
53                            // 将之前未消耗的word作为Match提交
54                            if let Some(index) = unconsumed_word_start_index {
55                                let result = Match::new(
56                                    TextRange::new(index, start_index),
57                                    None,
58                                );
59                                unmatched_results.push(result);
60                                unconsumed_word_start_index = None;
61                            }
62                        } else {
63                            if start_index >= maximum_matched_end_index {
64                                if let None = unconsumed_word_start_index {
65                                    unconsumed_word_start_index = Some(start_index);
66                                }
67                            }
68                        }
69                    },
70                    BehaviorForUnmatched::KeepAsChars => {
71                        if matched_results.len() > 0{
72                            // 将之前未消耗的char作为Match提交
73                            if let Some(index) = unconsumed_char_start_index {
74                                let result = Match::new(
75                                    TextRange::new(index, start_index),
76                                    None,
77                                );
78                                results.push(result);
79                                unconsumed_char_start_index = None;
80                            }
81                        } else {
82                            if start_index >= maximum_matched_end_index {
83                                if let None = unconsumed_char_start_index {
84                                    unconsumed_char_start_index = Some(start_index);
85                                }
86                            }
87                        }
88                    },
89                    BehaviorForUnmatched::Ignore => (),
90                }
91
92                unmatched_results
93            };
94
95            results.append(&mut unmatched_results);
96            results.append(&mut matched_results);
97        }
98    });
99    if maximum_matched_end_index < text.len() {
100        // 处理text剩余的文本
101        match behavior_for_unmatched {
102            BehaviorForUnmatched::KeepAsWords => {
103                results.push(Match::new(
104                    TextRange::new(maximum_matched_end_index, text.len()),
105                    None
106                ))
107            },
108            BehaviorForUnmatched::KeepAsChars => {
109                let iter = split_as_char_ranges(&text[maximum_matched_end_index..])
110                    .map(|range| {
111                        Match::new(
112                            TextRange::new(
113                                maximum_matched_end_index + range.start_index(),
114                                maximum_matched_end_index + range.end_index(),
115                            ),
116                            None
117                        )
118                    });
119
120                results.extend(iter);
121            }
122            BehaviorForUnmatched::Ignore => (),
123        }
124    }
125
126    results
127}
128
#[cfg(test)]
mod tests {
    use crate::BehaviorForUnmatched;
    use crate::cedarwood::{
        segment_fully,
        ForwardDictionary,
    };

    /// Mixed Chinese/English sample with leading and trailing whitespace.
    const SAMPLE_TEXT: &str = " 南京市长江大桥, hello world ";

    /// Builds the dictionary shared by the tests that run on `SAMPLE_TEXT`.
    fn sample_dict() -> ForwardDictionary {
        let patterns = vec![
            "南京",
            "南京市",
            "市长",
            "长江",
            "大桥",
            "你好世界",
        ];

        ForwardDictionary::new(patterns).unwrap()
    }

    /// With `Ignore`, only dictionary words are reported.
    #[test]
    fn test_ignore_unmatched() {
        let dict = sample_dict();

        let matches = segment_fully(SAMPLE_TEXT, &dict, BehaviorForUnmatched::Ignore);
        let words = matches
            .into_iter()
            .map(|m| m.range().extract(SAMPLE_TEXT).unwrap())
            .collect::<Vec<_>>();

        assert_eq!(words, vec!["南京", "南京市", "市长", "长江", "大桥"]);
    }

    /// With `KeepAsChars`, unmatched text is reported one character at a time.
    #[test]
    fn test_keep_unmatched_as_chars() {
        let dict = sample_dict();

        let matches = segment_fully(SAMPLE_TEXT, &dict, BehaviorForUnmatched::KeepAsChars);
        let words = matches
            .into_iter()
            .map(|m| m.range().extract(SAMPLE_TEXT).unwrap())
            .collect::<Vec<_>>();

        assert_eq!(
            words,
            vec![
                " ", "南京", "南京市", "市长", "长江", "大桥",
                ",", " ",
                "h", "e", "l", "l", "o",
                " ",
                "w", "o", "r", "l", "d",
                " ",
            ]
        );
    }

    /// With `KeepAsWords`, each unmatched run is reported as a single piece.
    #[test]
    fn test_keep_unmatched_as_words() {
        let dict = sample_dict();

        let matches = segment_fully(SAMPLE_TEXT, &dict, BehaviorForUnmatched::KeepAsWords);
        let words = matches
            .into_iter()
            .map(|m| m.range().extract(SAMPLE_TEXT).unwrap())
            .collect::<Vec<_>>();

        assert_eq!(
            words,
            vec![" ", "南京", "南京市", "市长", "长江", "大桥", ", hello world "]
        );
    }

    /// Each match carries the index of its pattern in the dictionary input.
    #[test]
    fn test_value() {
        let dict = sample_dict();

        let matches = segment_fully(SAMPLE_TEXT, &dict, BehaviorForUnmatched::Ignore);
        let pattern_indexes = matches
            .into_iter()
            .map(|m| m.index_of_patterns().unwrap())
            .collect::<Vec<_>>();

        assert_eq!(pattern_indexes, vec![0, 1, 2, 3, 4]);
    }

    /// Words touching the very start and very end of the text are found.
    #[test]
    fn test_chars_on_edge() {
        let text = "你好世界";
        let dict = ForwardDictionary::new(vec!["你好", "世界"]).unwrap();

        let matches = segment_fully(text, &dict, BehaviorForUnmatched::Ignore);
        let words = matches
            .into_iter()
            .map(|m| m.range().extract(text).unwrap())
            .collect::<Vec<_>>();

        assert_eq!(words, vec!["你好", "世界"]);
    }
}