nodedb-fts 0.0.0

Shared full-text search engine for NodeDB — inverted index, BM25 scoring, analyzers, fuzzy matching
Documentation
//! Feature-gated dictionary-based segmentation dispatch.
//!
//! When the corresponding feature is enabled, uses dictionary segmentation
//! instead of bigrams for that language. Falls back to bigrams when the
//! feature is disabled.
//!
//! Feature gates:
//! - `lang-ja`: lindera with IPADIC for Japanese
//! - `lang-zh`: jieba-rs for Chinese
//! - `lang-ko`: lindera with ko-dic for Korean
//! - `lang-th`: icu_segmenter for Thai

use super::bigram::tokenize_cjk;

/// Tokenize `text` with the segmenter associated with `lang`.
///
/// Recognized tags are compared exactly (case-sensitive):
/// `"ja"`/`"japanese"`, `"zh"`/`"chinese"`, `"ko"`/`"korean"` and
/// `"th"`/`"thai"`. Any other tag is handed to the bigram tokenizer.
pub fn segment(text: &str, lang: &str) -> Vec<String> {
    if matches!(lang, "ja" | "japanese") {
        return segment_japanese(text);
    }
    if matches!(lang, "zh" | "chinese") {
        return segment_chinese(text);
    }
    if matches!(lang, "ko" | "korean") {
        return segment_korean(text);
    }
    if matches!(lang, "th" | "thai") {
        return segment_thai(text);
    }
    // Unrecognized language tag: use the generic bigram tokenizer.
    tokenize_cjk(text)
}

/// Segment Japanese text.
///
/// With the `lang-ja` feature this calls `lindera_segment` with the
/// `"ipadic"` dictionary name; without it the text goes to the bigram
/// tokenizer.
fn segment_japanese(text: &str) -> Vec<String> {
    // Exactly one of the two `return`s survives conditional compilation.
    #[cfg(feature = "lang-ja")]
    return lindera_segment(text, "ipadic");

    #[cfg(not(feature = "lang-ja"))]
    return tokenize_cjk(text);
}

/// Segment Chinese text.
///
/// With the `lang-zh` feature this calls `jieba_segment`; without it the
/// text goes to the bigram tokenizer.
fn segment_chinese(text: &str) -> Vec<String> {
    // Exactly one of the two `return`s survives conditional compilation.
    #[cfg(feature = "lang-zh")]
    return jieba_segment(text);

    #[cfg(not(feature = "lang-zh"))]
    return tokenize_cjk(text);
}

/// Segment Korean text.
///
/// With the `lang-ko` feature this calls `lindera_segment` with the
/// `"ko-dic"` dictionary name; without it the text goes to the bigram
/// tokenizer.
fn segment_korean(text: &str) -> Vec<String> {
    // Exactly one of the two `return`s survives conditional compilation.
    #[cfg(feature = "lang-ko")]
    return lindera_segment(text, "ko-dic");

    #[cfg(not(feature = "lang-ko"))]
    return tokenize_cjk(text);
}

/// Segment Thai text.
///
/// With the `lang-th` feature this calls `icu_segment_thai`; without it the
/// text goes to the same bigram tokenizer used for CJK.
fn segment_thai(text: &str) -> Vec<String> {
    // Exactly one of the two `return`s survives conditional compilation.
    #[cfg(feature = "lang-th")]
    return icu_segment_thai(text);

    // Thai bigram fallback (same strategy as CJK).
    #[cfg(not(feature = "lang-th"))]
    return tokenize_cjk(text);
}

// ── Feature-gated implementations ──────────────────────────────────

/// Tokenize `text` with a lindera tokenizer and return each token's surface
/// form.
///
/// Compiled when either `lang-ja` or `lang-ko` is enabled: both
/// `segment_japanese` and `segment_korean` call this function, so gating it
/// on `lang-ja` alone breaks a build that enables only `lang-ko`.
///
/// If the tokenizer cannot be built, or tokenization fails, the text is
/// handed to the bigram tokenizer instead of returning an error.
///
/// NOTE(review): `_dict` is not passed to the builder, so the `"ipadic"` /
/// `"ko-dic"` names supplied by callers currently select nothing here; which
/// dictionary is used depends on how `TokenizerBuilder::new()` is configured.
/// Confirm against the lindera version in use.
#[cfg(any(feature = "lang-ja", feature = "lang-ko"))]
fn lindera_segment(text: &str, _dict: &str) -> Vec<String> {
    use lindera::tokenizer::TokenizerBuilder;

    let Ok(tokenizer) = TokenizerBuilder::new().and_then(|b| b.build()) else {
        return tokenize_cjk(text);
    };
    let Ok(tokens) = tokenizer.tokenize(text) else {
        return tokenize_cjk(text);
    };

    tokens
        .into_iter()
        .map(|t| t.surface.to_string())
        // `len()` counts UTF-8 bytes, so every token containing a non-ASCII
        // character passes the first test; the `is_cjk` test is only reached
        // for single-byte tokens.
        // NOTE(review): if "more than one character" was intended, this
        // wants `chars().count() > 1` — confirm before changing, since it
        // alters which single-character tokens are kept.
        .filter(|t: &String| t.len() > 1 || t.chars().next().is_some_and(super::script::is_cjk))
        .collect()
}

/// Segment `text` with jieba and return the non-blank pieces.
///
/// The `Jieba` instance is built once on first use and shared by later
/// calls, instead of being rebuilt for every piece of text.
/// Whitespace-only segments produced by `cut` are dropped.
#[cfg(feature = "lang-zh")]
fn jieba_segment(text: &str) -> Vec<String> {
    use jieba_rs::Jieba;
    use std::sync::OnceLock;

    // Process-wide segmenter, initialized lazily on the first call.
    static JIEBA: OnceLock<Jieba> = OnceLock::new();
    let jieba = JIEBA.get_or_init(Jieba::new);

    jieba
        // Second argument `false` is passed through unchanged from the
        // original call (jieba's HMM flag).
        .cut(text, false)
        .into_iter()
        // Filter before allocating so blank segments never become `String`s.
        .filter(|word| !word.trim().is_empty())
        .map(|word| word.to_string())
        .collect()
}

/// Split `text` at the break positions reported by `WordSegmenter` and
/// return the non-blank pieces.
///
/// Each pair of consecutive break offsets delimits one slice of `text`;
/// slices that are empty after trimming whitespace are dropped.
#[cfg(feature = "lang-th")]
fn icu_segment_thai(text: &str) -> Vec<String> {
    use icu_segmenter::WordSegmenter;

    let segmenter = WordSegmenter::new_auto();
    let boundaries: Vec<usize> = segmenter.segment_str(text).collect();

    boundaries
        .windows(2)
        .map(|pair| &text[pair[0]..pair[1]])
        .filter(|piece| !piece.trim().is_empty())
        .map(|piece| piece.to_string())
        .collect()
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Without `lang-zh`, the "zh" tag must yield overlapping bigrams.
    #[test]
    #[cfg(not(feature = "lang-zh"))]
    fn fallback_to_bigrams_chinese() {
        let expected = ["全文", "文検", "検索"];
        let got = segment("全文検索", "zh");
        assert_eq!(got, expected);
    }

    /// With `lang-zh`, only checks that jieba yields at least one token.
    #[test]
    #[cfg(feature = "lang-zh")]
    fn dictionary_segmentation_chinese() {
        let got = segment("全文検索", "zh");
        assert!(!got.is_empty());
    }

    /// Without `lang-ja`, the "ja" tag still produces some tokens.
    #[test]
    #[cfg(not(feature = "lang-ja"))]
    fn fallback_to_bigrams_japanese() {
        let got = segment("東京タワー", "ja");
        assert!(!got.is_empty());
    }

    /// With `lang-ja`, the "ja" tag produces some tokens.
    #[test]
    #[cfg(feature = "lang-ja")]
    fn dictionary_segmentation_japanese() {
        let got = segment("東京タワー", "ja");
        assert!(!got.is_empty());
    }

    /// Without `lang-ko`, the "ko" tag still produces some tokens.
    #[test]
    #[cfg(not(feature = "lang-ko"))]
    fn fallback_to_bigrams_korean() {
        let got = segment("한국어", "ko");
        assert!(!got.is_empty());
    }

    /// With `lang-ko`, the "ko" tag produces some tokens.
    #[test]
    #[cfg(feature = "lang-ko")]
    fn dictionary_segmentation_korean() {
        let got = segment("한국어", "ko");
        assert!(!got.is_empty());
    }

    /// An unrecognized language tag is routed to the bigram tokenizer.
    #[test]
    fn unknown_lang_fallback() {
        let expected = ["全文", "文検", "検索"];
        let got = segment("全文検索", "unknown");
        assert_eq!(got, expected);
    }
}