use std::sync::LazyLock;
use lindera::dictionary::{load_dictionary_from_kind, DictionaryKind};
use lindera::mode::{Mode, Penalty};
use lindera::segmenter::Segmenter as LinderaSegmenter;
use lindera::tokenizer::Tokenizer;
use crate::segmenter::Segmenter;
/// Korean specialized [`Segmenter`], backed by the Lindera morphological
/// analyzer with the ko-dic dictionary (see the `LINDERA` static below).
pub struct KoreanSegmenter;
/// Process-wide Lindera tokenizer for Korean.
///
/// Constructed lazily on first use: loading the ko-dic dictionary is
/// expensive, so it is done exactly once and the tokenizer is shared by
/// every call to `KoreanSegmenter::segment_str`.
static LINDERA: LazyLock<Tokenizer> = LazyLock::new(|| {
    // A failure here means the embedded ko-dic dictionary is missing or
    // corrupt — unrecoverable, so panic with a message that says so
    // instead of a bare `unwrap` on an opaque error.
    let dictionary = load_dictionary_from_kind(DictionaryKind::KoDic)
        .expect("failed to load the Korean (ko-dic) dictionary for Lindera");
    // `Mode::Decompose` makes Lindera split compounds into smaller units
    // (penalty-driven), which yields finer-grained segments for search.
    let segmenter = LinderaSegmenter::new(Mode::Decompose(Penalty::default()), dictionary, None);
    Tokenizer::new(segmenter)
});
impl Segmenter for KoreanSegmenter {
    /// Segments `to_segment` into morphemes using the shared Lindera
    /// tokenizer, returning slices borrowed from the original input.
    fn segment_str<'o>(&self, to_segment: &'o str) -> Box<dyn Iterator<Item = &'o str> + 'o> {
        let tokens = LINDERA.tokenize(to_segment).unwrap();
        // Map each token back onto the input via its byte offsets so the
        // returned `&str`s borrow from `to_segment`, not from the tokens.
        let mut segments = Vec::with_capacity(tokens.len());
        for token in tokens {
            segments.push(&to_segment[token.byte_start..token.byte_end]);
        }
        Box::new(segments.into_iter())
    }
}
#[cfg(test)]
mod test {
    use crate::segmenter::test::test_segmenter;
    // Unsegmented Korean sentence ("Korean morphological analysis can be
    // performed"), mixed with ASCII digits and punctuation.
    const TEXT: &str = "한국어의형태해석을실시할수있습니다 123 456.";
    // Expected output of the segmenter alone (raw morpheme boundaries,
    // whitespace preserved as its own segments).
    const SEGMENTED: &[&str] = &[
        "한국어",
        "의",
        "형태",
        "해석",
        "을",
        "실시",
        "할",
        "수",
        "있",
        "습니다",
        " ",
        "123",
        " ",
        "456",
        ".",
    ];
    // Expected output after the full tokenization pipeline; identical to
    // SEGMENTED here because normalization leaves these tokens unchanged.
    const TOKENIZED: &[&str] = &[
        "한국어",
        "의",
        "형태",
        "해석",
        "을",
        "실시",
        "할",
        "수",
        "있",
        "습니다",
        " ",
        "123",
        " ",
        "456",
        ".",
    ];
    // Project-local macro generating the test cases for this segmenter,
    // pinning the expected script/language detection to Hangul/Korean.
    test_segmenter!(KoreanSegmenter, TEXT, SEGMENTED, TOKENIZED, Script::Hangul, Language::Kor);
}