ultra_nlp/_hashmap/
segment_forward_longest.rs

1use crate::utils::split_as_char_ranges;
2use crate::{
3    Match,
4    TextRange,
5    BehaviorForUnmatched,
6};
7use crate::hashmap::Dictionary;
8
9// 待generator稳定, 改为generator, 以便返回Iterator.
10pub fn segment_forward_longest<T: AsRef<str>>(
11    text: T,
12    dict: &Dictionary,
13    behavior_for_unmatched: BehaviorForUnmatched,
14) -> Vec<Match> {
15    let text = text
16        .as_ref()
17        .to_lowercase();
18
19    let mut results: Vec<Match> = vec![];
20
21    let mut unconsumed_start_index: Option<usize> = None;
22    let mut maximum_matched_end_index = 0;
23    let mut start_index = 0;
24    while start_index < text.len() {
25        if text.is_char_boundary(start_index) {
26            let mut next_start_index = start_index + 1;
27
28            let mut matched_results: Vec<Match> = vec![];
29            let mut longest_match: Option<(
30                usize, // end_index
31                usize, // value
32            )> = None;
33            ((start_index + 1)..=text.len())
34                .into_iter()
35                .for_each(|end_index| {
36                    if text.is_char_boundary(end_index) {
37                        let sub_text = &text[start_index..end_index];
38
39                        if let Some(value) = dict.map.get(sub_text) {
40                            longest_match = Some((end_index, *value))
41                        }
42                    }
43                });
44
45            if let Some((end_index, value)) = longest_match{
46                let range = TextRange::new(
47                    start_index,
48                    end_index
49                );
50
51                let result = Match::new(range, Some(value));
52                matched_results.push(result);
53
54                next_start_index = range.end_index();
55                maximum_matched_end_index = range.end_index();
56            }
57
58            let mut unmatched_results: Vec<Match> = vec![];
59            match behavior_for_unmatched {
60                BehaviorForUnmatched::KeepAsWords => {
61                    if matched_results.len() > 0 {
62                        // 将之前未消耗的word作为Match提交
63                        if let Some(index) = unconsumed_start_index {
64                            let result = Match::new(
65                                TextRange::new(index, start_index),
66                                None,
67                            );
68                            unmatched_results.push(result);
69                            unconsumed_start_index = None;
70                        }
71                    } else {
72                        if start_index >= maximum_matched_end_index {
73                            if let None = unconsumed_start_index {
74                                unconsumed_start_index = Some(start_index);
75                            }
76                        }
77                    }
78                },
79                BehaviorForUnmatched::KeepAsChars => {
80                    if matched_results.len() > 0 {
81                        // 将之前未消耗的char作为Match提交
82                        if let Some(index) = unconsumed_start_index {
83                            let result = Match::new(
84                                TextRange::new(index, start_index),
85                                None,
86                            );
87                            unmatched_results.push(result);
88                            unconsumed_start_index = None;
89                        }
90                    } else {
91                        if start_index >= maximum_matched_end_index {
92                            if let None = unconsumed_start_index {
93                                unconsumed_start_index = Some(start_index);
94                            }
95                        }
96                    }
97                },
98                BehaviorForUnmatched::Ignore => (),
99            }
100
101            results.append(&mut unmatched_results);
102            results.append(&mut matched_results);
103
104            start_index = next_start_index;
105        } else {
106            start_index += 1;
107        }
108    }
109    if maximum_matched_end_index < text.len() {
110        // 处理text剩余的文本
111        match behavior_for_unmatched {
112            BehaviorForUnmatched::KeepAsWords => {
113                results.push(Match::new(
114                    TextRange::new(maximum_matched_end_index, text.len()),
115                    None
116                ))
117            },
118            BehaviorForUnmatched::KeepAsChars => {
119                let iter = split_as_char_ranges(&text[maximum_matched_end_index..])
120                    .map(|range| {
121                        Match::new(
122                            TextRange::new(
123                                maximum_matched_end_index + range.start_index(),
124                                maximum_matched_end_index + range.end_index(),
125                            ),
126                            None
127                        )
128                    });
129
130                results.extend(iter);
131            }
132            BehaviorForUnmatched::Ignore => (),
133        }
134    }
135
136    results
137}
138
#[cfg(test)]
mod tests {
    use crate::BehaviorForUnmatched;
    use crate::hashmap::{
        segment_forward_longest,
        Dictionary,
    };

    /// Sentence shared by most tests: CJK text, punctuation and ASCII words padded by spaces.
    const SAMPLE_TEXT: &str = " 商品和服务, hello world ";

    /// Builds the dictionary shared by most tests.
    fn sample_dict() -> Dictionary {
        Dictionary::new(
            vec!["商品", "和服", "服务", "你好世界"]
        ).unwrap()
    }

    /// With `Ignore`, only dictionary hits are returned.
    #[test]
    fn test_ignore_unmatched() {
        let dict = sample_dict();

        let mut pieces = Vec::new();
        for found in segment_forward_longest(
            SAMPLE_TEXT,
            &dict,
            BehaviorForUnmatched::Ignore,
        ) {
            pieces.push(found.range().extract(SAMPLE_TEXT).unwrap());
        }

        assert_eq!(pieces, vec!["商品", "和服"]);
    }

    /// With `KeepAsChars`, unmatched text comes back one character at a time.
    #[test]
    fn test_keep_unmatched_as_chars() {
        let dict = sample_dict();

        let mut pieces = Vec::new();
        for found in segment_forward_longest(
            SAMPLE_TEXT,
            &dict,
            BehaviorForUnmatched::KeepAsChars,
        ) {
            pieces.push(found.range().extract(SAMPLE_TEXT).unwrap());
        }

        let expected = vec![
            " ", "商品", "和服", "务", ",", " ",
            "h", "e", "l", "l", "o", " ",
            "w", "o", "r", "l", "d", " ",
        ];
        assert_eq!(pieces, expected);
    }

    /// With `KeepAsWords`, each unmatched run comes back as a single piece.
    #[test]
    fn test_keep_unmatched_as_words() {
        let dict = sample_dict();

        let mut pieces = Vec::new();
        for found in segment_forward_longest(
            SAMPLE_TEXT,
            &dict,
            BehaviorForUnmatched::KeepAsWords,
        ) {
            pieces.push(found.range().extract(SAMPLE_TEXT).unwrap());
        }

        let expected = vec![" ", "商品", "和服", "务, hello world "];
        assert_eq!(pieces, expected);
    }

    /// Each hit reports the value stored for its pattern.
    #[test]
    fn test_value() {
        let dict = sample_dict();

        let mut values = Vec::new();
        for found in segment_forward_longest(
            SAMPLE_TEXT,
            &dict,
            BehaviorForUnmatched::Ignore,
        ) {
            values.push(found.index_of_patterns().unwrap());
        }

        assert_eq!(values, vec![0, 1]);
    }

    /// Matches that touch the very start and the very end of the text are found.
    #[test]
    fn test_chars_on_edge() {
        let text = "你好世界";
        let dict = Dictionary::new(
            vec!["你好", "世界"]
        ).unwrap();

        let mut pieces = Vec::new();
        for found in segment_forward_longest(
            text,
            &dict,
            BehaviorForUnmatched::Ignore,
        ) {
            pieces.push(found.range().extract(text).unwrap());
        }

        assert_eq!(pieces, vec!["你好", "世界"]);
    }
}