// charabia 0.9.9
//
// A simple library to detect the language, tokenize the text, and normalize the tokens.
// Documentation
use std::num::NonZero;
use std::sync::LazyLock;

use fst::raw::Fst;

// Import `Segmenter` trait.
use crate::segmenter::utils::{BufferingStrategy, FstSegmenter};
use crate::segmenter::Segmenter;

/// Thai specialized [`Segmenter`].
///
/// This segmenter uses a dictionary encoded as an FST to segment the provided text.
/// The type itself carries no data: its [`Segmenter`] implementation delegates to a
/// lazily built `FstSegmenter` shared through a static.
///
/// Dictionary source: the PyThaiNLP project, <https://github.com/PyThaiNLP/nlpo3>.
pub struct ThaiSegmenter;

/// Thai word dictionary, embedded in the binary at compile time and loaded as an FST on
/// first access.
///
/// # Panics
///
/// The first access panics if the embedded bytes are not a valid FST, which would mean the
/// bundled dictionary file is broken (a packaging bug, not a runtime condition).
static WORDS_FST: LazyLock<Fst<&[u8]>> = LazyLock::new(|| {
    Fst::new(&include_bytes!("../../dictionaries/fst/thai/words.fst")[..])
        .expect("the embedded Thai words dictionary must be a valid FST")
});

/// Shared segmenter backed by `WORDS_FST`, constructed on first use.
static FST_SEGMENTER: LazyLock<FstSegmenter> = LazyLock::new(|| {
    // `NonZero::<usize>::MIN` is 1: with a maximum of one character, the segmenter buffers
    // the characters one at a time, or until the next match is found.
    let max_char_count = Some(NonZero::<usize>::MIN);
    let strategy = BufferingStrategy::UntilNextMatch { max_char_count };

    FstSegmenter::new(&WORDS_FST, strategy)
});

impl Segmenter for ThaiSegmenter {
    fn segment_str<'o>(&self, to_segment: &'o str) -> Box<dyn Iterator<Item = &'o str> + 'o> {
        FST_SEGMENTER.segment_str(to_segment)
    }
}

// Test the segmenter:
#[cfg(test)]
mod test {
    use crate::segmenter::test::test_segmenter;

    // Thai sample text followed by two space-separated numbers.
    const TEXT: &str = "ภาษาไทยง่ายนิดเดียว ไก่ขันตอนเช้าบนขันน้ำ ฉันสระผมที่สระน้ำด้วยน้ำยาสระผม 123 456";

    // Expected segments of `TEXT`, in order. Each space is a segment of its own.
    const SEGMENTED: &[&str] = &[
        "ภาษาไทย",
        "ง่าย",
        "นิดเดียว",
        " ",
        "ไก่",
        "ขัน",
        "ตอนเช้า",
        "บน",
        "ขันน้ำ",
        " ",
        "ฉัน",
        "สระผม",
        "ที่",
        "สระน้ำ",
        "ด้วย",
        "น้ำยา",
        "สระผม",
        " ",
        "123",
        " ",
        "456",
    ];

    // Expected tokens for `TEXT` once normalized, aligned one-to-one with `SEGMENTED`.
    // Compared with `SEGMENTED`, the marks stacked on the letters are dropped
    // (e.g. "ง่าย" becomes "งาย") while base letters, spaces and digits are kept.
    const TOKENIZED: &[&str] = &[
        "ภาษาไทย",
        "งาย",
        "นดเดยว",
        " ",
        "ไก",
        "ขน",
        "ตอนเชา",
        "บน",
        "ขนนา",
        " ",
        "ฉน",
        "สระผม",
        // "ที่" is the base letter "ท" plus two marks: the letter must remain, as for
        // every other entry, so the expected token is not empty.
        "ท",
        "สระนา",
        "ดวย",
        "นายา",
        "สระผม",
        " ",
        "123",
        " ",
        "456",
    ];
    // Macro that runs several tests on the Segmenter.
    test_segmenter!(ThaiSegmenter, TEXT, SEGMENTED, TOKENIZED, Script::Thai, Language::Tha);
}