// ultra_nlp/extract_consecutive_chinese_chars.rs
use lazy_static::lazy_static;
use regex::Regex;
3
4pub fn extract_consecutive_chinese_chars(text: &str) -> impl Iterator<Item = &str>{
5 lazy_static! {
6 static ref RE: Regex = Regex::new(r"\P{Script=Han}+").unwrap();
7 }
8
9 let result = RE
10 .split(text)
11 .filter(|x| !x.is_empty());
12
13 result
14}
15
#[cfg(test)]
mod tests {
    /// Tests for `extract_consecutive_chinese_chars`.
    mod extract_consecutive_chinese_chars {
        use crate::extract_consecutive_chinese_chars::extract_consecutive_chinese_chars;

        /// Runs the extractor on `text` and gathers the yielded slices so
        /// they can be compared against an expected list.
        fn runs(text: &str) -> Vec<&str> {
            // The function already returns an iterator, so it is collected
            // directly without an extra `into_iter()` call.
            extract_consecutive_chinese_chars(text).collect()
        }

        /// Han runs separated by ASCII text and a fullwidth comma are
        /// returned individually and in order.
        #[test]
        fn test_extract_chinese_chars() {
            let text = "foo中文,bar,字符baz";

            assert_eq!(runs(text), vec!["中文", "字符"]);
        }

        /// Empty input produces no runs.
        #[test]
        fn test_empty_input() {
            assert!(runs("").is_empty());
        }

        /// Input without any Han characters produces no runs.
        #[test]
        fn test_no_chinese_chars() {
            assert!(runs("foo, bar").is_empty());
        }

        /// Input made up only of Han characters comes back as a single run.
        #[test]
        fn test_only_chinese_chars() {
            assert_eq!(runs("中文字符"), vec!["中文字符"]);
        }

        /// Runs that touch the start or end of the input are still returned,
        /// with no empty slices alongside them.
        #[test]
        fn test_runs_at_boundaries() {
            assert_eq!(runs("中文 and 字符"), vec!["中文", "字符"]);
        }
    }
}