use std::num::NonZero;
use std::sync::LazyLock;
use fst::raw::Fst;
use crate::segmenter::utils::{BufferingStrategy, FstSegmenter};
use crate::segmenter::Segmenter;
/// Thai specialized [`Segmenter`].
///
/// Segments Thai text (which has no spaces between words) by looking up
/// words in an embedded dictionary compiled as a finite-state transducer.
pub struct ThaiSegmenter;
/// Thai word dictionary, pre-compiled into an FST and embedded in the binary.
static WORDS_FST: LazyLock<Fst<&[u8]>> = LazyLock::new(|| {
    // The dictionary bytes are baked in at compile time; a malformed FST here
    // is a build artifact bug, so panicking on failure is acceptable.
    let dictionary: &[u8] = include_bytes!("../../dictionaries/fst/thai/words.fst");
    Fst::new(dictionary).unwrap()
});
/// Shared segmenter instance built over [`WORDS_FST`], initialized on first use.
static FST_SEGMENTER: LazyLock<FstSegmenter> = LazyLock::new(|| {
    // NOTE(review): `NonZero::<usize>::MIN` is 1, i.e. a one-character cap for
    // the buffering strategy — confirm against `BufferingStrategy` docs that
    // this is the intended limit for Thai.
    let strategy = BufferingStrategy::UntilNextMatch {
        max_char_count: Some(NonZero::<usize>::MIN),
    };
    FstSegmenter::new(&WORDS_FST, strategy)
});
impl Segmenter for ThaiSegmenter {
    // Delegates directly to the shared FST-based segmenter; the returned
    // iterator borrows from `to_segment` for the lifetime 'o.
    fn segment_str<'o>(&self, to_segment: &'o str) -> Box<dyn Iterator<Item = &'o str> + 'o> {
        FST_SEGMENTER.segment_str(to_segment)
    }
}
#[cfg(test)]
mod test {
    use crate::segmenter::test::test_segmenter;

    // Sample Thai text: space-less Thai words and compounds, mixed with
    // whitespace-separated ASCII digit runs.
    const TEXT: &str = "ภาษาไทยง่ายนิดเดียว ไก่ขันตอนเช้าบนขันน้ำ ฉันสระผมที่สระน้ำด้วยน้ำยาสระผม 123 456";

    // Expected output of the segmenter alone: dictionary words are split
    // apart and each whitespace run is kept as its own segment.
    const SEGMENTED: &[&str] = &[
        "ภาษาไทย",
        "ง่าย",
        "นิดเดียว",
        " ",
        "ไก่",
        "ขัน",
        "ตอนเช้า",
        "บน",
        "ขันน้ำ",
        " ",
        "ฉัน",
        "สระผม",
        "ที่",
        "สระน้ำ",
        "ด้วย",
        "น้ำยา",
        "สระผม",
        " ",
        "123",
        " ",
        "456",
    ];

    // Expected tokens after the full pipeline. The Thai entries differ from
    // SEGMENTED by the removal of vowel/tone mark characters — presumably the
    // normalizer strips them; confirm against the `test_segmenter!` macro and
    // the Thai normalizer implementation.
    const TOKENIZED: &[&str] = &[
        "ภาษาไทย",
        "งาย",
        "นดเดยว",
        " ",
        "ไก",
        "ขน",
        "ตอนเชา",
        "บน",
        "ขนนา",
        " ",
        "ฉน",
        "สระผม",
        "ท",
        "สระนา",
        "ดวย",
        "นายา",
        "สระผม",
        " ",
        "123",
        " ",
        "456",
    ];

    // Runs the shared segmenter test suite: checks segmentation, tokenization,
    // and that TEXT is detected with the given script and language.
    test_segmenter!(ThaiSegmenter, TEXT, SEGMENTED, TOKENIZED, Script::Thai, Language::Tha);
}