// memory-indexer 0.3.0
//
// An in-memory full-text fuzzy search indexer.
// Documentation
use std::collections::{HashMap, HashSet};

use super::{
    super::tokenizer::{
        DictionarySegmenter, TextNormalizer, TextNormalizerRef, normalize_query, normalize_term,
        script_runs, tokenize_chinese, tokenize_hangul, tokenize_japanese,
    },
    DictionaryConfig, Segment, SegmentScript, TokenDraft,
};

/// Pluggable strategy that turns one [`Segment`] into token drafts.
///
/// `DefaultTokenizer` stores boxed implementations of this trait keyed by
/// [`SegmentScript`] and calls the one matching each segment's script.
pub trait ScriptTokenizerStrategy {
    /// Tokenizes `segment`, writing the produced tokens into `out`.
    ///
    /// * `segment` - the text slice to tokenize, together with its script and
    ///   its offset in the original input.
    /// * `normalizer` - the text normalizer supplied by the caller.
    /// * `dictionary` - optional dictionary segmenter; implementations that do
    ///   not need one may ignore it.
    /// * `out` - receives the resulting [`TokenDraft`]s.
    /// * `seen` - mutable set shared across calls; presumably
    ///   `(term, start, end)` keys used to skip duplicate tokens — confirm
    ///   against the tokenizer helpers.
    fn tokenize(
        &self,
        segment: &Segment<'_>,
        normalizer: &dyn TextNormalizer,
        dictionary: Option<&DictionarySegmenter>,
        out: &mut Vec<TokenDraft>,
        seen: &mut HashSet<(String, usize, usize)>,
    );
}

/// Splits input text into per-script [`Segment`]s using `script_runs`.
pub struct DefaultScriptSegmenter;

impl DefaultScriptSegmenter {
    /// Returns one [`Segment`] per `(script, start, end)` run reported by
    /// `script_runs`, in the order the runs are reported.
    ///
    /// Each segment borrows `text[start..end]` and records `start` as its
    /// offset into `text`.
    pub fn segment<'a>(&self, text: &'a str) -> Vec<Segment<'a>> {
        let mut segments = Vec::new();
        for (script, start, end) in script_runs(text) {
            segments.push(Segment {
                script,
                text: &text[start..end],
                offset: start,
            });
        }
        segments
    }
}

pub struct HanTokenizer;

impl ScriptTokenizerStrategy for HanTokenizer {
    fn tokenize(
        &self,
        segment: &Segment<'_>,
        normalizer: &dyn TextNormalizer,
        _dictionary: Option<&DictionarySegmenter>,
        out: &mut Vec<TokenDraft>,
        seen: &mut HashSet<(String, usize, usize)>,
    ) {
        let mut tokens = Vec::new();
        tokenize_chinese(
            segment.text,
            segment.offset,
            segment.script,
            normalizer,
            &mut tokens,
            seen,
        );
        out.extend(tokens.into_iter().map(TokenDraft::from));
    }
}

pub struct KanaTokenizer;

impl ScriptTokenizerStrategy for KanaTokenizer {
    fn tokenize(
        &self,
        segment: &Segment<'_>,
        normalizer: &dyn TextNormalizer,
        dictionary: Option<&DictionarySegmenter>,
        out: &mut Vec<TokenDraft>,
        seen: &mut HashSet<(String, usize, usize)>,
    ) {
        let mut tokens = Vec::new();
        tokenize_japanese(
            segment.text,
            segment.offset,
            segment.script,
            normalizer,
            dictionary,
            &mut tokens,
            seen,
        );
        out.extend(tokens.into_iter().map(TokenDraft::from));
    }
}

pub struct HangulTokenizer;

impl ScriptTokenizerStrategy for HangulTokenizer {
    fn tokenize(
        &self,
        segment: &Segment<'_>,
        normalizer: &dyn TextNormalizer,
        dictionary: Option<&DictionarySegmenter>,
        out: &mut Vec<TokenDraft>,
        seen: &mut HashSet<(String, usize, usize)>,
    ) {
        let mut tokens = Vec::new();
        tokenize_hangul(
            segment.text,
            segment.offset,
            segment.script,
            normalizer,
            dictionary,
            &mut tokens,
            seen,
        );
        out.extend(tokens.into_iter().map(TokenDraft::from));
    }
}

pub struct LatinOtherTokenizer;

impl ScriptTokenizerStrategy for LatinOtherTokenizer {
    fn tokenize(
        &self,
        segment: &Segment<'_>,
        normalizer: &dyn TextNormalizer,
        _dictionary: Option<&DictionarySegmenter>,
        out: &mut Vec<TokenDraft>,
        seen: &mut HashSet<(String, usize, usize)>,
    ) {
        let mut tokens = Vec::new();
        normalizer.normalize(
            segment.text,
            segment.offset,
            segment.script,
            &mut tokens,
            seen,
        );
        out.extend(tokens.into_iter().map(TokenDraft::from));
    }
}

/// Tokenizer that dispatches each segment to a per-script strategy.
///
/// Strategies are looked up by the segment's [`SegmentScript`]; when none is
/// registered for that script the `fallback` strategy is used instead.
pub struct DefaultTokenizer {
    /// Normalizer handed to every strategy invocation.
    pub(crate) normalizer: TextNormalizerRef,
    /// Strategies registered per script via `register_script_tokenizer`.
    pub(crate) tokenizers: HashMap<SegmentScript, Box<dyn ScriptTokenizerStrategy>>,
    /// Strategy used for scripts that have no entry in `tokenizers`.
    pub(crate) fallback: Box<dyn ScriptTokenizerStrategy>,
    /// Optional dictionary segmenter; `None` until `with_dictionary` is called.
    pub(crate) dictionary: Option<DictionarySegmenter>,
}

impl DefaultTokenizer {
    /// Builds a tokenizer with the default script strategies and the
    /// normalizer returned by `normalize_term`.
    pub fn for_documents() -> Self {
        let normalizer = normalize_term();
        Self::with_default_scripts(normalizer)
    }

    /// Builds a tokenizer with the default script strategies and the
    /// normalizer returned by `normalize_query`.
    pub fn for_queries() -> Self {
        let normalizer = normalize_query();
        Self::with_default_scripts(normalizer)
    }

    /// Shared setup for the public constructors: `LatinOtherTokenizer` as the
    /// fallback, plus dedicated strategies for Han, Hiragana, Katakana and
    /// Hangul segments.
    fn with_default_scripts(normalizer: TextNormalizerRef) -> Self {
        let tokenizer = Self::new(normalizer, Box::new(LatinOtherTokenizer));
        let tokenizer = tokenizer.register_script_tokenizer(SegmentScript::Han, HanTokenizer);
        let tokenizer =
            tokenizer.register_script_tokenizer(SegmentScript::Hiragana, KanaTokenizer);
        let tokenizer =
            tokenizer.register_script_tokenizer(SegmentScript::Katakana, KanaTokenizer);
        tokenizer.register_script_tokenizer(SegmentScript::Hangul, HangulTokenizer)
    }

    /// Creates a tokenizer with no per-script strategies and no dictionary.
    ///
    /// Until strategies are registered, every segment is handled by
    /// `fallback`.
    pub fn new(normalizer: TextNormalizerRef, fallback: Box<dyn ScriptTokenizerStrategy>) -> Self {
        let tokenizers = HashMap::new();
        Self {
            normalizer,
            tokenizers,
            fallback,
            dictionary: None,
        }
    }

    /// Registers `tokenizer` as the strategy for `script` and returns the
    /// updated tokenizer.
    ///
    /// A strategy previously registered for the same script is replaced.
    pub fn register_script_tokenizer<T: ScriptTokenizerStrategy + 'static>(
        mut self,
        script: SegmentScript,
        tokenizer: T,
    ) -> Self {
        let boxed: Box<dyn ScriptTokenizerStrategy> = Box::new(tokenizer);
        self.tokenizers.insert(script, boxed);
        self
    }

    /// Builds a `DictionarySegmenter` from `config` and attaches it, replacing
    /// any segmenter set earlier.
    pub fn with_dictionary(mut self, config: DictionaryConfig) -> Self {
        let segmenter = DictionarySegmenter::new(config);
        self.dictionary = Some(segmenter);
        self
    }

    /// Tokenizes `segment` with the strategy registered for its script, or
    /// with the fallback strategy when none is registered.
    ///
    /// The tokenizer's normalizer and optional dictionary are forwarded to
    /// the chosen strategy along with `out` and `seen`.
    pub fn tokenize_segment(
        &self,
        segment: &Segment<'_>,
        out: &mut Vec<TokenDraft>,
        seen: &mut HashSet<(String, usize, usize)>,
    ) {
        // Pick the strategy first so the call itself is written only once.
        let strategy = self
            .tokenizers
            .get(&segment.script)
            .unwrap_or(&self.fallback);

        strategy.tokenize(
            segment,
            self.normalizer.as_ref(),
            self.dictionary.as_ref(),
            out,
            seen,
        );
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::tokenizer::{OffsetMap, dictionary::ScriptDictionary, normalize_term};
    use std::collections::HashSet;

    /// Test double that ignores the normalizer, dictionary and `seen` set and
    /// always emits a single `"fallback"` token spanning the whole segment.
    struct FallbackTokenizer;

    impl ScriptTokenizerStrategy for FallbackTokenizer {
        fn tokenize(
            &self,
            segment: &Segment<'_>,
            _normalizer: &dyn TextNormalizer,
            _dictionary: Option<&DictionarySegmenter>,
            out: &mut Vec<TokenDraft>,
            _seen: &mut HashSet<(String, usize, usize)>,
        ) {
            // The token covers the segment from its offset to its end.
            let start = segment.offset;
            let span = (start, start + segment.text.len());

            out.push(TokenDraft {
                text: String::from("fallback"),
                span,
                script: segment.script,
                mapping: OffsetMap::identity(span),
            });
        }
    }

    /// `with_dictionary` must leave the tokenizer holding a segmenter.
    #[test]
    fn allows_injecting_dictionary_config() {
        let japanese = ScriptDictionary {
            version: Some(String::from("v1")),
            entries: HashSet::new(),
        };
        let config = DictionaryConfig {
            japanese: Some(japanese),
            hangul: None,
        };

        let tokenizer = DefaultTokenizer::for_documents().with_dictionary(config);

        assert!(tokenizer.dictionary.is_some());
    }

    /// With no strategies registered, the fallback must handle the segment.
    #[test]
    fn falls_back_to_default_tokenizer_when_script_not_registered() {
        let segment = Segment {
            script: SegmentScript::Other,
            text: "abc",
            offset: 0,
        };
        let mut drafts = Vec::new();
        let mut seen = HashSet::new();

        let tokenizer = DefaultTokenizer::new(normalize_term(), Box::new(FallbackTokenizer));
        tokenizer.tokenize_segment(&segment, &mut drafts, &mut seen);

        assert_eq!(drafts.len(), 1);
        assert_eq!(drafts[0].text, "fallback");
    }
}