ultra_nlp/
extract_consecutive_chinese_chars.rs

1use lazy_static::lazy_static;
2use regex::Regex;
3
4pub fn extract_consecutive_chinese_chars(text: &str) -> impl Iterator<Item = &str>{
5    lazy_static! {
6        static ref RE: Regex = Regex::new(r"\P{Script=Han}+").unwrap();
7    }
8
9    let result = RE
10        .split(text)
11        .filter(|x| !x.is_empty());
12
13    result
14}
15
#[cfg(test)]
mod tests {
    mod extract_consecutive_chinese_chars {
        use crate::extract_consecutive_chinese_chars::extract_consecutive_chinese_chars;

        /// Han runs separated by non-Han text are returned in input order,
        /// with none of the separating text included.
        #[test]
        fn test_extract_chinese_chars() {
            let runs: Vec<&str> =
                extract_consecutive_chinese_chars("foo中文,bar,字符baz").collect();

            assert_eq!(runs, ["中文", "字符"]);
        }
    }
}