charabia 0.9.9

A simple library to detect the language, tokenize the text and normalize the tokens
Documentation
use std::sync::LazyLock;

use jieba_rs::Jieba;

use crate::segmenter::Segmenter;

/// Chinese Script specialized [`Segmenter`].
///
/// Segmentation happens in two passes: the text is first cut by [`Jieba`] with its
/// Hidden Markov Model (HMM) feature disabled, then every word Jieba returns that is
/// longer than two characters and not purely ASCII alphanumeric is split further
/// into dictionary bigrams/trigrams or single characters.
pub struct ChineseSegmenter;

/// Returns the prefix of `s` made of its first `N` characters.
///
/// The prefix is a sub-slice of `s` cut on a character boundary, so multi-byte
/// characters are never split.
///
/// Returns `None` when `s` holds fewer than `N` characters, or when `N` is zero
/// (a zero-length gram is never useful to callers, and this avoids underflowing
/// `N - 1`).
fn next_gram<const N: usize>(s: &str) -> Option<&str> {
    // Zero-based position of the last character that belongs to the gram.
    let last = N.checked_sub(1)?;
    s.char_indices()
        .nth(last)
        // The gram ends right after the bytes of its last character.
        .map(|(start, c)| &s[..start + c.len_utf8()])
}

/// Refines a word into smaller, searchable sub-words.
///
/// Words of at most two characters, and words made only of ASCII letters and digits,
/// are yielded unchanged. Any other word is consumed from left to right, taking at
/// each position the first match among: a two-character prefix known to the Jieba
/// dictionary, a three-character prefix known to the dictionary, or the next single
/// character. Every yielded slice borrows from `s`.
fn cut_for_search<'a>(s: &'a str) -> Box<dyn Iterator<Item = &'a str> + 'a> {
    // No third character means the word has at most two characters.
    let is_short = s.chars().nth(2).is_none();
    if is_short || s.chars().all(|c| c.is_ascii_alphanumeric()) {
        return Box::new(std::iter::once(s));
    }

    let mut pieces = Vec::new();
    let mut rest = s;
    // Bigrams are tried before trigrams on purpose: being greedy on the shorter
    // gram tends to produce more sub-words. The single-character fallback only
    // fails once `rest` is empty, which ends the loop.
    while let Some(piece) = next_gram::<2>(rest)
        .filter(|gram| JIEBA.has_word(gram))
        .or_else(|| next_gram::<3>(rest).filter(|gram| JIEBA.has_word(gram)))
        .or_else(|| next_gram::<1>(rest))
    {
        pieces.push(piece);
        // Skip past the bytes that were just registered.
        rest = &rest[piece.len()..];
    }
    Box::new(pieces.into_iter())
}

impl Segmenter for ChineseSegmenter {
    /// Splits `to_segment` into segments borrowed from the input.
    ///
    /// The text is first cut by the shared Jieba instance, then each resulting word
    /// goes through `cut_for_search`, which may break it into smaller sub-words.
    fn segment_str<'o>(&self, to_segment: &'o str) -> Box<dyn Iterator<Item = &'o str> + 'o> {
        let mut segments: Vec<&str> = Vec::new();
        // `false` disables Jieba's Hidden Markov Models.
        for word in JIEBA.cut(to_segment, false) {
            segments.extend(cut_for_search(word));
        }
        Box::new(segments.into_iter())
    }
}

/// [`Jieba`] instance shared by the segmentation code in this file.
///
/// Wrapped in a [`LazyLock`] so that `Jieba::new` runs once, on first access,
/// rather than on every call.
static JIEBA: LazyLock<Jieba> = LazyLock::new(Jieba::new);

#[cfg(test)]
mod test {
    use crate::segmenter::test::test_segmenter;

    // Original version of the text.
    const TEXT: &str =
        "人人生而自由﹐在尊嚴和權利上一律平等。他們賦有理性和良心﹐並應以兄弟關係的精神互相對待。人民的意志是政府权力的基础,这一意志应以定期的和真正的选举予以表现。夏天,像是哼着小曲的少年,恶作剧般在大地上洒满每一种灿烂的颜色。 123 456。";

    // Segmented version of the text.
    //
    // The entries are consecutive, non-empty slices of TEXT: concatenating them in
    // order must rebuild TEXT exactly. Characters that do not belong to a multi-character
    // word (including punctuation, copied verbatim from TEXT) each form their own segment.
    const SEGMENTED: &[&str] = &[
        "人人",
        "生",
        "而",
        "自由",
        "﹐",
        "在",
        "尊",
        "嚴",
        "和",
        "權",
        "利",
        "上",
        "一律",
        "平等",
        "。",
        "他",
        "們",
        "賦",
        "有",
        "理性",
        "和",
        "良心",
        "﹐",
        "並",
        "應",
        "以",
        "兄弟",
        "關",
        "係",
        "的",
        "精神",
        "互相",
        "對",
        "待",
        "。",
        "人民",
        "的",
        "意志",
        "是",
        "政府",
        "权力",
        "的",
        "基础",
        ",",
        "这",
        "一",
        "意志",
        "应",
        "以",
        "定期",
        "的",
        "和",
        "真正",
        "的",
        "选举",
        "予以",
        "表现",
        "。",
        "夏天",
        ",",
        "像是",
        "哼",
        "着",
        "小曲",
        "的",
        "少年",
        ",",
        "恶作剧",
        "般",
        "在",
        "大",
        "地上",
        "洒满",
        "每",
        "一种",
        "灿烂",
        "的",
        "颜色",
        "。",
        " ",
        "123",
        " ",
        "456",
        "。",
    ];

    // Segmented and normalized version of the text, used when the
    // `chinese-normalization-pinyin` feature is enabled: one entry per segment,
    // holding the romanized (pinyin, with tone marks) form of that segment.
    //
    // NOTE(review): several entries below are empty strings. They sit where TEXT has
    // a single-character segment, so they look like tokens lost when this file was
    // extracted rather than intended expectations — confirm against the upstream
    // fixture before relying on this table.
    #[cfg(feature = "chinese-normalization-pinyin")]
    const TOKENIZED: &[&str] = &[
        "rénrén",
        "shēng",
        "ér",
        "zìyóu",
        ",",
        "zài",
        "zūn",
        "yán",
        "",
        "quán",
        "",
        "shàng",
        "yīlǜ",
        "píngděng",
        "",
        "",
        "men",
        "",
        "yǒu",
        "lǐxìng",
        "",
        "liángxīn",
        ",",
        "bìng",
        "yīng",
        "",
        "xiōngdì",
        "guān",
        "",
        "de",
        "jīngshén",
        "hùxiāng",
        "duì",
        "dài",
        "",
        "rénmín",
        "de",
        "yìzhì",
        "shì",
        "zhèngfǔ",
        "quánlì",
        "de",
        "jīchǔ",
        ",",
        "zhè",
        "",
        "yìzhì",
        "yīng",
        "",
        "dìngqī",
        "de",
        "",
        "zhēnzhèng",
        "de",
        "xuǎnjǔ",
        "yǔyǐ",
        "biǎoxiàn",
        "",
        "xiàtiān",
        ",",
        "xiàngshì",
        "hēng",
        "zhe",
        "xiǎoqū",
        "de",
        "shǎonián",
        ",",
        "èzuòjù",
        "bān",
        "zài",
        "",
        "dìshàng",
        "sǎmǎn",
        "měi",
        "yīzhǒng",
        "cànlàn",
        "de",
        "yánsè",
        "",
        " ",
        "123",
        " ",
        "456",
        "",
    ];

    // Segmented and normalized version of the text, used when the
    // `chinese-normalization-pinyin` feature is disabled: one entry per segment,
    // holding the normalized form of that segment (still in Chinese script).
    //
    // NOTE(review): many entries below are empty strings. They sit where TEXT has
    // a single-character segment, so they look like tokens lost when this file was
    // extracted rather than intended expectations — confirm against the upstream
    // fixture before relying on this table.
    #[cfg(not(feature = "chinese-normalization-pinyin"))]
    const TOKENIZED: &[&str] = &[
        "人人",
        "",
        "",
        "自由",
        ",",
        "",
        "",
        "",
        "",
        "",
        "",
        "",
        "一律",
        "平等",
        "",
        "",
        "",
        "",
        "",
        "理性",
        "",
        "良心",
        ",",
        "",
        "",
        "",
        "兄弟",
        "",
        "",
        "",
        "精神",
        "互相",
        "",
        "",
        "",
        "人民",
        "",
        "意志",
        "",
        "政府",
        "权力",
        "",
        "基礎",
        ",",
        "",
        "",
        "意志",
        "",
        "",
        "定期",
        "",
        "",
        "眞正",
        "",
        "選舉",
        "予以",
        "表現",
        "",
        "夏天",
        ",",
        "像是",
        "",
        "",
        "小曲",
        "",
        "少年",
        ",",
        "惡作劇",
        "",
        "",
        "",
        "地上",
        "洒滿",
        "",
        "一种",
        "灿爛",
        "",
        "顏色",
        "",
        " ",
        "123",
        " ",
        "456",
        "",
    ];

    // Macro that runs several tests on the Segmenter, driven by the fixtures above
    // (input text, expected segments, expected normalized tokens) together with the
    // expected script and language.
    test_segmenter!(ChineseSegmenter, TEXT, SEGMENTED, TOKENIZED, Script::Cj, Language::Cmn);

    /// Digits and Latin letters embedded in Chinese text must stay whole while the
    /// surrounding Chinese text is still segmented.
    #[test]
    fn test_mix_number_and_letter() {
        let seg = ChineseSegmenter;
        let words: Vec<&str> = seg.segment_str("我从2025年开始学习Rust语言。").collect();
        // Segments are consecutive non-empty slices of the input, so together they
        // must cover every character of it, lone characters and punctuation included.
        assert_eq!(words, vec!["我", "从", "2025", "年", "开始", "学习", "Rust", "语言", "。"]);
    }
}