//! Jieba tokenizer support for `lance-index` 4.0.1 — Lance indices implementation.
//!
//! See the crate documentation for details.
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors

use std::{fs::File, io::BufReader, path::Path, path::PathBuf};

use lance_core::{Error, Result};
use serde::{Deserialize, Serialize, de::DeserializeOwned};

/// On-disk configuration for the Jieba tokenizer, deserialized from the
/// `config.json` file inside a language-model directory. Both fields are
/// optional; a missing file yields `JiebaConfig::default()`.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct JiebaConfig {
    // Path of the main dictionary, relative to the model directory.
    // Falls back to "dict.txt" when absent (see `JiebaBuilder::main_dict_path`).
    main: Option<String>,
    // Paths of additional user dictionaries, relative to the model directory,
    // loaded on top of the main dictionary.
    users: Option<Vec<String>>,
}

/// File name of the optional JSON config (deserialized into the builder's
/// `Config` type) looked up inside the language-model directory by
/// `JiebaTokenizerBuilder::load`.
pub const JIEBA_LANGUAGE_MODEL_CONFIG_FILE: &str = "config.json";

/// Builder abstraction for Jieba-style tokenizers configured from a directory
/// containing dictionary files plus an optional `config.json`.
pub trait JiebaTokenizerBuilder: Sized {
    /// Deserialized form of the directory's `config.json`.
    type Config: DeserializeOwned + Default;

    /// Load a builder from the language-model directory `p`.
    ///
    /// Errors if `p` is not a directory. Reads `config.json` from `p` when it
    /// exists, otherwise falls back to `Self::Config::default()`, then
    /// delegates to [`Self::new`].
    fn load(p: &Path) -> Result<Self> {
        if !p.is_dir() {
            return Err(Error::invalid_input(format!(
                "Invalid directory path: {}",
                p.display()
            )));
        }
        let config_path = p.join(JIEBA_LANGUAGE_MODEL_CONFIG_FILE);
        let config: Self::Config = if config_path.exists() {
            serde_json::from_reader(BufReader::new(File::open(config_path)?))?
        } else {
            Self::Config::default()
        };
        Self::new(config, p)
    }

    /// Construct a builder from an already-parsed config and the model root.
    fn new(config: Self::Config, root: &Path) -> Result<Self>;

    /// Produce a tantivy analyzer builder wrapping this tokenizer.
    fn build(&self) -> Result<tantivy::tokenizer::TextAnalyzerBuilder>;
}

/// Builds a tantivy text analyzer backed by a `jieba_rs::Jieba` tokenizer,
/// loading its dictionaries from a language-model directory.
pub struct JiebaBuilder {
    // Language-model directory; relative dictionary paths from `config`
    // are joined onto this root.
    root: PathBuf,
    // Configuration parsed from `config.json`, or the default when absent.
    config: JiebaConfig,
}

impl JiebaBuilder {
    /// Absolute path of the main dictionary: the configured relative path
    /// joined onto the root, or `<root>/dict.txt` when unconfigured.
    fn main_dict_path(&self) -> PathBuf {
        match &self.config.main {
            Some(rel) => self.root.join(rel),
            None => self.root.join("dict.txt"),
        }
    }

    /// Absolute paths of the configured user dictionaries (empty when none).
    fn user_dict_paths(&self) -> Vec<PathBuf> {
        self.config
            .users
            .as_deref()
            .unwrap_or(&[])
            .iter()
            .map(|rel| self.root.join(rel))
            .collect()
    }
}

impl JiebaTokenizerBuilder for JiebaBuilder {
    type Config = JiebaConfig;

    fn new(config: Self::Config, root: &Path) -> Result<Self> {
        Ok(Self {
            config,
            root: root.to_path_buf(),
        })
    }

    fn build(&self) -> Result<tantivy::tokenizer::TextAnalyzerBuilder> {
        let main_dict_path = &self.main_dict_path();
        let file = std::fs::File::open(main_dict_path)?;
        let mut f = std::io::BufReader::new(file);
        let mut jieba = jieba_rs::Jieba::with_dict(&mut f).map_err(|e| {
            Error::invalid_input(format!(
                "Failed to load Jieba dictionary from {}: {}",
                main_dict_path.display(),
                e
            ))
        })?;
        for user_dict_path in &self.user_dict_paths() {
            let file = std::fs::File::open(user_dict_path)?;
            let mut f = std::io::BufReader::new(file);
            jieba.load_dict(&mut f).map_err(|e| {
                Error::invalid_input(format!(
                    "Failed to load Jieba user dictionary from {}: {}",
                    user_dict_path.display(),
                    e
                ))
            })?
        }
        let tokenizer = JiebaTokenizer { jieba };
        Ok(tantivy::tokenizer::TextAnalyzer::builder(tokenizer).dynamic())
    }
}

/// Tantivy tokenizer wrapping a `jieba_rs::Jieba` instance.
#[derive(Clone)]
struct JiebaTokenizer {
    jieba: jieba_rs::Jieba,
}

/// Fully materialized token stream: all tokens are computed up front in
/// `token_stream`, and `index` is the cursor (it points one past the token
/// currently returned by `token()`).
struct JiebaTokenStream {
    tokens: Vec<tantivy::tokenizer::Token>,
    index: usize,
}

impl tantivy::tokenizer::TokenStream for JiebaTokenStream {
    /// Move the cursor to the next token; returns `false` once exhausted.
    fn advance(&mut self) -> bool {
        let has_next = self.index < self.tokens.len();
        if has_next {
            self.index += 1;
        }
        has_next
    }

    // Per tantivy's contract, `token()`/`token_mut()` are only called after
    // `advance()` has returned `true`, so `index - 1` is in bounds.
    fn token(&self) -> &tantivy::tokenizer::Token {
        &self.tokens[self.index - 1]
    }

    fn token_mut(&mut self) -> &mut tantivy::tokenizer::Token {
        &mut self.tokens[self.index - 1]
    }
}

// NOTE(review): only this impl is gated on `tokenizer-jieba`, yet
// `JiebaBuilder::build` above constructs a `TextAnalyzer` from `JiebaTokenizer`
// unconditionally — confirm the enclosing module is itself compiled only under
// this feature, otherwise the gate here is either redundant or breaks the build
// when the feature is off.
#[cfg(feature = "tokenizer-jieba")]
impl tantivy::tokenizer::Tokenizer for JiebaTokenizer {
    type TokenStream<'a> = JiebaTokenStream;

    /// Tokenize `text` with jieba in `Search` mode and convert jieba's
    /// char-index spans into tantivy tokens (which use byte offsets).
    fn token_stream(&mut self, text: &str) -> JiebaTokenStream {
        // Map char index -> byte offset. The sentinel appended at the end makes
        // `indices[token.end]` valid for a token ending at the end of `text`.
        let mut indices = text.char_indices().collect::<Vec<_>>();
        indices.push((text.len(), '\0'));
        // jieba token `start`/`end` are char indices into `text`.
        let orig_tokens = self
            .jieba
            .tokenize(text, jieba_rs::TokenizeMode::Search, true);
        let mut tokens = Vec::new();
        for token in orig_tokens {
            tokens.push(tantivy::tokenizer::Token {
                // Byte offsets into `text`.
                offset_from: indices[token.start].0,
                offset_to: indices[token.end].0,
                // The start char index is reused as the token position, and the
                // char count as the position length.
                position: token.start,
                text: String::from(&text[(indices[token.start].0)..(indices[token.end].0)]),
                position_length: token.end - token.start,
            });
        }
        JiebaTokenStream { tokens, index: 0 }
    }
}