use std::sync::LazyLock;
use jieba_rs::Jieba;
use crate::segmenter::Segmenter;
/// Segmenter for Chinese text, backed by the jieba dictionary; the actual
/// splitting logic lives in its `Segmenter::segment_str` implementation.
pub struct ChineseSegmenter;
/// Returns the prefix of `s` spanning its first `N` characters, or `None`
/// when `s` holds fewer than `N` characters.
///
/// The slice boundary is computed from `char_indices`, so it always falls on
/// a UTF-8 character boundary and is safe for multi-byte (CJK) text.
fn next_gram<const N: usize>(s: &str) -> Option<&str> {
    // `Option::map` replaces the manual `match Some/None` pair
    // (clippy::manual_map); behavior is unchanged.
    s.char_indices()
        .nth(N - 1)
        .map(|(byte_index, c)| &s[..byte_index + c.len_utf8()])
}
/// Splits one jieba-produced word into the smaller dictionary words a search
/// query is likely to contain.
///
/// Words of at most two characters, and pure ASCII-alphanumeric runs (numbers,
/// Latin terms), are yielded unchanged. Anything longer is scanned left to
/// right, greedily taking a dictionary bigram first, then a dictionary
/// trigram, and finally falling back to a single character.
fn cut_for_search<'a>(s: &'a str) -> Box<dyn Iterator<Item = &'a str> + 'a> {
    // Already a minimal search unit: pass it through untouched.
    if s.chars().count() <= 2 || s.chars().all(|c| c.is_ascii_alphanumeric()) {
        return Box::new(std::iter::once(s));
    }

    let mut pieces = Vec::new();
    let mut cursor = 0;
    while cursor < s.len() {
        // `cursor` only ever advances by whole grams, so this slice start is
        // always a character boundary.
        let rest = &s[cursor..];
        let piece = next_gram::<2>(rest)
            .filter(|gram| JIEBA.has_word(gram))
            .or_else(|| next_gram::<3>(rest).filter(|gram| JIEBA.has_word(gram)))
            .or_else(|| next_gram::<1>(rest));
        match piece {
            Some(p) => {
                cursor += p.len();
                pieces.push(p);
            }
            None => break,
        }
    }
    Box::new(pieces.into_iter())
}
impl Segmenter for ChineseSegmenter {
fn segment_str<'o>(&self, to_segment: &'o str) -> Box<dyn Iterator<Item = &'o str> + 'o> {
let segmented: Vec<&str> = JIEBA
.cut(to_segment, false) .into_iter()
.flat_map(|x| cut_for_search(x))
.collect();
Box::new(segmented.into_iter())
}
}
// Shared jieba instance with its default dictionary, initialized lazily on
// first access so the cost is paid only when Chinese text is segmented.
static JIEBA: LazyLock<Jieba> = LazyLock::new(Jieba::new);
#[cfg(test)]
mod test {
use crate::segmenter::test::test_segmenter;
// Input sample mixing Traditional Chinese, Simplified Chinese, full-width
// punctuation and ASCII digits, so segmentation, normalization and the
// ASCII pass-through path are all exercised.
const TEXT: &str =
"人人生而自由﹐在尊嚴和權利上一律平等。他們賦有理性和良心﹐並應以兄弟關係的精神互相對待。人民的意志是政府权力的基础,这一意志应以定期的和真正的选举予以表现。夏天,像是哼着小曲的少年,恶作剧般在大地上洒满每一种灿烂的颜色。 123 456。";
// Expected raw segmentation of TEXT, one dictionary word (or single
// character / punctuation mark) per entry.
const SEGMENTED: &[&str] = &[
"人人",
"生",
"而",
"自由",
"﹐",
"在",
"尊",
"嚴",
"和",
"權",
"利",
"上",
"一律",
"平等",
"。",
"他",
"們",
"賦",
"有",
"理性",
"和",
"良心",
"﹐",
"並",
"應",
"以",
"兄弟",
"關",
"係",
"的",
"精神",
"互相",
"對",
"待",
"。",
"人民",
"的",
"意志",
"是",
"政府",
"权力",
"的",
"基础",
",",
"这",
"一",
"意志",
"应",
"以",
"定期",
"的",
"和",
"真正",
"的",
"选举",
"予以",
"表现",
"。",
"夏天",
",",
"像是",
"哼",
"着",
"小曲",
"的",
"少年",
",",
"恶作剧",
"般",
"在",
"大",
"地上",
"洒满",
"每",
"一种",
"灿烂",
"的",
"颜色",
"。",
" ",
"123",
" ",
"456",
"。",
];
// Expected tokens when the `chinese-normalization-pinyin` feature is
// enabled: each word is rendered as pinyin with tone marks, ASCII runs are
// kept verbatim. NOTE(review): exact pinyin output depends on the
// normalizer behind this feature — confirm against that normalizer's tests.
#[cfg(feature = "chinese-normalization-pinyin")]
const TOKENIZED: &[&str] = &[
"rénrén",
"shēng",
"ér",
"zìyóu",
",",
"zài",
"zūn",
"yán",
"hé",
"quán",
"lì",
"shàng",
"yīlǜ",
"píngděng",
"。",
"tā",
"men",
"fù",
"yǒu",
"lǐxìng",
"hé",
"liángxīn",
",",
"bìng",
"yīng",
"yǐ",
"xiōngdì",
"guān",
"xì",
"de",
"jīngshén",
"hùxiāng",
"duì",
"dài",
"。",
"rénmín",
"de",
"yìzhì",
"shì",
"zhèngfǔ",
"quánlì",
"de",
"jīchǔ",
",",
"zhè",
"yī",
"yìzhì",
"yīng",
"yǐ",
"dìngqī",
"de",
"hé",
"zhēnzhèng",
"de",
"xuǎnjǔ",
"yǔyǐ",
"biǎoxiàn",
"。",
"xiàtiān",
",",
"xiàngshì",
"hēng",
"zhe",
"xiǎoqū",
"de",
"shǎonián",
",",
"èzuòjù",
"bān",
"zài",
"dà",
"dìshàng",
"sǎmǎn",
"měi",
"yīzhǒng",
"cànlàn",
"de",
"yánsè",
"。",
" ",
"123",
" ",
"456",
"。",
];
// Expected tokens without the pinyin feature: the default normalization
// keeps the characters (some entries differ from SEGMENTED where
// normalization maps between character variants).
#[cfg(not(feature = "chinese-normalization-pinyin"))]
const TOKENIZED: &[&str] = &[
"人人",
"生",
"而",
"自由",
",",
"在",
"尊",
"嚴",
"和",
"權",
"利",
"上",
"一律",
"平等",
"。",
"他",
"們",
"賦",
"有",
"理性",
"和",
"良心",
",",
"並",
"應",
"以",
"兄弟",
"關",
"係",
"的",
"精神",
"互相",
"對",
"待",
"。",
"人民",
"的",
"意志",
"是",
"政府",
"权力",
"的",
"基礎",
",",
"这",
"一",
"意志",
"應",
"以",
"定期",
"的",
"和",
"眞正",
"的",
"選舉",
"予以",
"表現",
"。",
"夏天",
",",
"像是",
"哼",
"着",
"小曲",
"的",
"少年",
",",
"惡作劇",
"般",
"在",
"大",
"地上",
"洒滿",
"每",
"一种",
"灿爛",
"的",
"顏色",
"。",
" ",
"123",
" ",
"456",
"。",
];
// Shared test harness: checks segmentation, tokenization, script and
// language detection against the expected constants above.
test_segmenter!(ChineseSegmenter, TEXT, SEGMENTED, TOKENIZED, Script::Cj, Language::Cmn);
// ASCII-alphanumeric runs (digits, Latin words) must be passed through
// whole by `cut_for_search` rather than split character by character.
#[test]
fn test_mix_number_and_letter() {
let seg = ChineseSegmenter;
let words: Vec<&str> = seg.segment_str("我从2025年开始学习Rust语言。").collect();
assert_eq!(words, vec!["我", "从", "2025", "年", "开始", "学习", "Rust", "语言", "。"]);
}
}