ultra_nlp/_hashmap/
segment_fully.rs

1use crate::utils::split_as_char_ranges;
2use crate::{
3    Match,
4    TextRange,
5    BehaviorForUnmatched,
6};
7use crate::hashmap::Dictionary;
8
9// 待generator稳定, 改为generator, 以便返回Iterator.
10pub fn segment_fully<T: AsRef<str>>(
11    text: T,
12    dict: &Dictionary,
13    behavior_for_unmatched: BehaviorForUnmatched,
14) -> Vec<Match> {
15    let text = text
16        .as_ref()
17        .to_lowercase();
18    let mut results: Vec<Match> = vec![];
19
20    let mut unconsumed_word_start_index: Option<usize> = None;
21    let mut unconsumed_char_start_index: Option<usize> = None;
22    let mut maximum_matched_end_index = 0;
23    (0..text.len()).into_iter().for_each(|start_index| {
24        if text.is_char_boundary(start_index) {
25            let mut matched_results: Vec<Match> = vec![];
26            ((start_index + 1)..=text.len())
27                .into_iter()
28                .for_each(|end_index| {
29                    if text.is_char_boundary(end_index) {
30                        let sub_text = &text[start_index..end_index];
31
32                        if let Some(value) = dict.map.get(sub_text) {
33                            let range = TextRange::new(
34                                start_index,
35                                end_index
36                            );
37
38                            let result = Match::new(range, Some(*value));
39                            matched_results.push(result);
40
41                            if range.end_index() > maximum_matched_end_index {
42                                maximum_matched_end_index = range.end_index();
43                            }
44                        }
45                    }
46                });
47
48            let mut unmatched_results: Vec<Match> = {
49                let mut unmatched_results = vec![];
50
51                match behavior_for_unmatched {
52                    BehaviorForUnmatched::KeepAsWords => {
53                        if matched_results.len() > 0 {
54                            // 将之前未消耗的word作为Match提交
55                            if let Some(index) = unconsumed_word_start_index {
56                                let result = Match::new(
57                                    TextRange::new(index, start_index),
58                                    None,
59                                );
60                                unmatched_results.push(result);
61                                unconsumed_word_start_index = None;
62                            }
63                        } else {
64                            if start_index >= maximum_matched_end_index {
65                                if let None = unconsumed_word_start_index {
66                                    unconsumed_word_start_index = Some(start_index);
67                                }
68                            }
69                        }
70                    },
71                    BehaviorForUnmatched::KeepAsChars => {
72                        if matched_results.len() > 0{
73                            // 将之前未消耗的char作为Match提交
74                            if let Some(index) = unconsumed_char_start_index {
75                                let result = Match::new(
76                                    TextRange::new(index, start_index),
77                                    None,
78                                );
79                                results.push(result);
80                                unconsumed_char_start_index = None;
81                            }
82                        } else {
83                            if start_index >= maximum_matched_end_index {
84                                if let None = unconsumed_char_start_index {
85                                    unconsumed_char_start_index = Some(start_index);
86                                }
87                            }
88                        }
89                    },
90                    BehaviorForUnmatched::Ignore => (),
91                }
92
93                unmatched_results
94            };
95
96            results.append(&mut unmatched_results);
97            results.append(&mut matched_results);
98        }
99    });
100    if maximum_matched_end_index < text.len() {
101        // 处理text剩余的文本
102        match behavior_for_unmatched {
103            BehaviorForUnmatched::KeepAsWords => {
104                results.push(Match::new(
105                    TextRange::new(maximum_matched_end_index, text.len()),
106                    None
107                ))
108            },
109            BehaviorForUnmatched::KeepAsChars => {
110                let iter = split_as_char_ranges(&text[maximum_matched_end_index..])
111                    .map(|range| {
112                        Match::new(
113                            TextRange::new(
114                                maximum_matched_end_index + range.start_index(),
115                                maximum_matched_end_index + range.end_index(),
116                            ),
117                            None
118                        )
119                    });
120
121                results.extend(iter);
122            }
123            BehaviorForUnmatched::Ignore => (),
124        }
125    }
126
127    results
128}
129
#[cfg(test)]
mod tests {
    use crate::hashmap::{segment_fully, Dictionary};
    use crate::BehaviorForUnmatched;

    /// Sentence shared by the tests that exercise unmatched-text handling.
    const SAMPLE_TEXT: &str = " 南京市长江大桥, hello world ";

    /// Builds the dictionary shared by the tests that use `SAMPLE_TEXT`.
    fn sample_dict() -> Dictionary {
        Dictionary::new(vec![
            "南京",
            "南京市",
            "市长",
            "长江",
            "大桥",
            "你好世界",
        ])
        .unwrap()
    }

    /// With `Ignore`, only dictionary words are reported.
    #[test]
    fn test_ignore_unmatched() {
        let dict = sample_dict();

        let pieces: Vec<_> =
            segment_fully(SAMPLE_TEXT, &dict, BehaviorForUnmatched::Ignore)
                .into_iter()
                .map(|m| m.range().extract(SAMPLE_TEXT).unwrap())
                .collect();

        assert_eq!(pieces, vec!["南京", "南京市", "市长", "长江", "大桥"]);
    }

    /// With `KeepAsChars`, unmatched text is reported one character at a time.
    #[test]
    fn test_keep_unmatched_as_chars() {
        let dict = sample_dict();

        let pieces: Vec<_> =
            segment_fully(SAMPLE_TEXT, &dict, BehaviorForUnmatched::KeepAsChars)
                .into_iter()
                .map(|m| m.range().extract(SAMPLE_TEXT).unwrap())
                .collect();

        let expected = vec![
            " ", "南京", "南京市", "市长", "长江", "大桥",
            ",", " ",
            "h", "e", "l", "l", "o",
            " ",
            "w", "o", "r", "l", "d",
            " ",
        ];
        assert_eq!(pieces, expected);
    }

    /// With `KeepAsWords`, each unmatched stretch is reported as one piece.
    #[test]
    fn test_keep_unmatched_as_words() {
        let dict = sample_dict();

        let pieces: Vec<_> =
            segment_fully(SAMPLE_TEXT, &dict, BehaviorForUnmatched::KeepAsWords)
                .into_iter()
                .map(|m| m.range().extract(SAMPLE_TEXT).unwrap())
                .collect();

        let expected = vec![
            " ",
            "南京",
            "南京市",
            "市长",
            "长江",
            "大桥",
            ", hello world ",
        ];
        assert_eq!(pieces, expected);
    }

    /// Each match carries the index of the dictionary entry it came from.
    #[test]
    fn test_value() {
        let dict = sample_dict();

        let indexes: Vec<_> =
            segment_fully(SAMPLE_TEXT, &dict, BehaviorForUnmatched::Ignore)
                .into_iter()
                .map(|m| m.index_of_patterns().unwrap())
                .collect();

        assert_eq!(indexes, vec![0, 1, 2, 3, 4]);
    }

    /// Words touching the very start and very end of the text are found.
    #[test]
    fn test_chars_on_edge() {
        let text = "你好世界";
        let dict = Dictionary::new(vec!["你好", "世界"]).unwrap();

        let pieces: Vec<_> =
            segment_fully(text, &dict, BehaviorForUnmatched::Ignore)
                .into_iter()
                .map(|m| m.range().extract(text).unwrap())
                .collect();

        assert_eq!(pieces, vec!["你好", "世界"]);
    }
}