use super::bigram::tokenize_cjk;
/// Splits `text` into search tokens using the segmenter registered for `lang`.
///
/// `lang` accepts either a short code ("ja") or an English name ("japanese"),
/// matched case-insensitively; any unrecognized language falls back to the
/// CJK bigram tokenizer.
pub fn segment(text: &str, lang: &str) -> Vec<String> {
    // Normalize the tag so "JA", "Japanese", etc. select the same backend
    // as their lowercase forms.
    match lang.to_ascii_lowercase().as_str() {
        "ja" | "japanese" => segment_japanese(text),
        "zh" | "chinese" => segment_chinese(text),
        "ko" | "korean" => segment_korean(text),
        "th" | "thai" => segment_thai(text),
        _ => tokenize_cjk(text),
    }
}
/// Segments Japanese text: dictionary-based (lindera IPADIC) when the
/// `lang-ja` feature is enabled, otherwise CJK bigrams.
fn segment_japanese(text: &str) -> Vec<String> {
    // Exactly one of these returns is compiled in, depending on the feature.
    #[cfg(feature = "lang-ja")]
    return lindera_segment(text, "ipadic");
    #[cfg(not(feature = "lang-ja"))]
    return tokenize_cjk(text);
}
/// Segments Chinese text.
///
/// Always uses the CJK bigram tokenizer — no dictionary-backed feature is
/// wired up for Chinese here (unlike `ja`/`ko`/`th`).
// NOTE(review): presumably a dictionary segmenter (e.g. jieba-style) was
// considered out of scope — confirm before adding a `lang-zh` feature.
fn segment_chinese(text: &str) -> Vec<String> {
tokenize_cjk(text)
}
/// Segments Korean text: dictionary-based (lindera ko-dic) when the
/// `lang-ko` feature is enabled, otherwise CJK bigrams.
fn segment_korean(text: &str) -> Vec<String> {
    // Exactly one of these returns is compiled in, depending on the feature.
    #[cfg(feature = "lang-ko")]
    return lindera_segment(text, "ko-dic");
    #[cfg(not(feature = "lang-ko"))]
    return tokenize_cjk(text);
}
/// Segments Thai text: ICU word segmentation when the `lang-th` feature is
/// enabled, otherwise CJK bigrams.
fn segment_thai(text: &str) -> Vec<String> {
    // Exactly one of these returns is compiled in, depending on the feature.
    #[cfg(feature = "lang-th")]
    return icu_segment_thai(text);
    #[cfg(not(feature = "lang-th"))]
    return tokenize_cjk(text);
}
#[cfg(feature = "lang-ja")]
/// Tokenizes `text` with a lindera dictionary tokenizer, falling back to CJK
/// bigrams if the tokenizer cannot be built or tokenization fails.
///
/// Single-byte tokens (ASCII punctuation, lone letters) are dropped unless
/// their first char is CJK.
// NOTE(review): `_dict` is currently unused — the builder picks up its
// dictionary from default configuration; confirm this is intentional.
fn lindera_segment(text: &str, _dict: &str) -> Vec<String> {
    use lindera::tokenizer::TokenizerBuilder;
    let tokenizer = match TokenizerBuilder::new().and_then(|b| b.build()) {
        Ok(t) => t,
        // Builder failure: degrade gracefully to the bigram fallback.
        Err(_) => return tokenize_cjk(text),
    };
    match tokenizer.tokenize(text) {
        Ok(tokens) => tokens
            .into_iter()
            .filter_map(|tok| {
                let surface = tok.surface.to_string();
                // Keep multi-byte tokens, and single-byte ones only if CJK
                // (byte length — any CJK char is multi-byte anyway).
                let keep = surface.len() > 1
                    || surface.chars().next().is_some_and(super::script::is_cjk);
                keep.then_some(surface)
            })
            .collect(),
        // Tokenization failure: same graceful fallback.
        Err(_) => tokenize_cjk(text),
    }
}
#[cfg(feature = "lang-th")]
/// Segments Thai text with the ICU word segmenter, returning the non-blank
/// words between consecutive breakpoints.
fn icu_segment_thai(text: &str) -> Vec<String> {
    use icu_segmenter::WordSegmenter;
    let segmenter = WordSegmenter::new_auto();
    // Byte offsets of every word boundary reported by ICU.
    let bounds: Vec<usize> = segmenter.segment_str(text).collect();
    // Pair each boundary with its successor to slice out the words,
    // skipping whitespace-only spans.
    bounds
        .iter()
        .zip(bounds.iter().skip(1))
        .map(|(&start, &end)| &text[start..end])
        .filter(|word| !word.trim().is_empty())
        .map(str::to_string)
        .collect()
}
#[cfg(test)]
mod tests {
use super::*;
// Chinese always goes through the bigram tokenizer (no dictionary feature).
#[test]
fn bigrams_chinese() {
let tokens = segment("全文検索", "zh");
assert_eq!(tokens, vec!["全文", "文検", "検索"]);
}
// Without `lang-ja`, Japanese must still produce tokens via the fallback.
#[test]
#[cfg(not(feature = "lang-ja"))]
fn fallback_to_bigrams_japanese() {
let tokens = segment("東京タワー", "ja");
assert!(!tokens.is_empty());
}
// With `lang-ja`, the lindera path must produce tokens (exact output
// depends on the bundled dictionary, so only non-emptiness is asserted).
#[test]
#[cfg(feature = "lang-ja")]
fn dictionary_segmentation_japanese() {
let tokens = segment("東京タワー", "ja");
assert!(!tokens.is_empty());
}
// Without `lang-ko`, Korean must still produce tokens via the fallback.
#[test]
#[cfg(not(feature = "lang-ko"))]
fn fallback_to_bigrams_korean() {
let tokens = segment("한국어", "ko");
assert!(!tokens.is_empty());
}
// With `lang-ko`, the lindera ko-dic path must produce tokens.
#[test]
#[cfg(feature = "lang-ko")]
fn dictionary_segmentation_korean() {
let tokens = segment("한국어", "ko");
assert!(!tokens.is_empty());
}
// Unknown language tags fall back to CJK bigrams.
#[test]
fn unknown_lang_fallback() {
let tokens = segment("全文検索", "unknown");
assert_eq!(tokens, vec!["全文", "文検", "検索"]);
}
}