//! dictx-index 0.1.0
//!
//! Index builder and binary entry storage for DictX.
//! Documentation
use dictx_core::{DictEntry, DictxError, Result};
use tantivy::schema::{
    Field, NumericOptions, Schema, TextFieldIndexing, TextOptions, STORED, STRING, TEXT,
};
use tantivy::TantivyDocument;

use crate::{expand_for_search, EntryLocator};

/// Version number of the index schema.
///
/// NOTE(review): presumably bumped whenever the field layout produced by
/// `DictxSchema::build` changes so that stale on-disk indexes can be detected;
/// nothing in this file reads it — confirm against the callers.
pub const SCHEMA_VERSION: u32 = 2;

/// A tantivy [`Schema`] bundled with the handle of every field DictX uses.
///
/// Obtain one with `DictxSchema::build()` (fresh schema) or
/// `DictxSchema::from_schema()` (handles resolved by name from an existing
/// schema). The per-field notes below describe what `build()` registers and
/// what `to_document()` writes into each field.
#[derive(Debug, Clone)]
pub struct DictxSchema {
    /// The tantivy schema the field handles below belong to.
    pub schema: Schema,
    /// `"id"` — `entry.id`; registered as `STRING | STORED`.
    pub id: Field,
    /// `"word"` — `entry.word`; registered as `STRING | STORED`.
    pub word: Field,
    /// `"word_lower"` — `entry.word_lower`; registered as `STRING | STORED`.
    pub word_lower: Field,
    /// `"source"` — `entry.source.slug()`; registered as `STRING | STORED`.
    pub source: Field,
    /// `"pos"` — one value per element of `entry.pos`; `STRING | STORED`.
    pub pos: Field,
    /// `"tag"` — one value per element of `entry.tags`; `STRING | STORED`.
    pub tag: Field,
    /// `"phonetic"` — the UK and US phonetics that are present, joined by a
    /// single space; registered as `TEXT | STORED`.
    pub phonetic: Field,
    /// `"definition"` — the `zh` and `en` text of every definition, joined by
    /// single spaces; registered as `TEXT | STORED`.
    pub definition: Field,
    /// `"examples"` — the `zh` and `en` text of every example, joined by
    /// single spaces; registered as `TEXT | STORED`.
    pub examples: Field,
    /// `"phrases"` — the `zh` and `en` text of every phrase, joined by single
    /// spaces; registered as `TEXT | STORED`.
    pub phrases: Field,
    /// `"search_text"` — `expand_for_search(entry.all_text())`; registered
    /// with the options from `text_with_positions()` (no `STORED` flag).
    pub search_text: Field,
    /// `"freq_bnc"` — `entry.freq_bnc`, or `u32::MAX` when it is `None`;
    /// indexed, stored and fast `u64`.
    pub freq_bnc: Field,
    /// `"freq_coca"` — `entry.freq_coca`, or `u32::MAX` when it is `None`;
    /// indexed, stored and fast `u64`.
    pub freq_coca: Field,
    /// `"collins"` — `entry.collins_star`; indexed, stored and fast `u64`.
    pub collins: Field,
    /// `"oxford"` — `entry.oxford_3000` converted to `u64`; indexed, stored
    /// and fast `u64`.
    pub oxford: Field,
    /// `"entry_offset"` — `EntryLocator::offset`; stored-only `u64`.
    /// `None` when the schema passed to `from_schema()` has no such field.
    pub entry_offset: Option<Field>,
    /// `"entry_len"` — `EntryLocator::len`; stored-only `u64`.
    /// `None` when the schema passed to `from_schema()` has no such field.
    pub entry_len: Option<Field>,
    /// `"raw_json"` — the entry serialized with `serde_json`. `build()` never
    /// registers it (always `None` there); it is only `Some` when
    /// `from_schema()` finds the field in an existing schema.
    pub raw_json: Option<Field>,
}

impl DictxSchema {
    pub fn build() -> Self {
        let mut builder = Schema::builder();

        let id = builder.add_text_field("id", STRING | STORED);
        let word = builder.add_text_field("word", STRING | STORED);
        let word_lower = builder.add_text_field("word_lower", STRING | STORED);
        let source = builder.add_text_field("source", STRING | STORED);
        let pos = builder.add_text_field("pos", STRING | STORED);
        let tag = builder.add_text_field("tag", STRING | STORED);
        let phonetic = builder.add_text_field("phonetic", TEXT | STORED);
        let definition = builder.add_text_field("definition", TEXT | STORED);
        let examples = builder.add_text_field("examples", TEXT | STORED);
        let phrases = builder.add_text_field("phrases", TEXT | STORED);
        let search_text = builder.add_text_field("search_text", text_with_positions());

        let numeric = NumericOptions::default()
            .set_indexed()
            .set_stored()
            .set_fast();
        let freq_bnc = builder.add_u64_field("freq_bnc", numeric.clone());
        let freq_coca = builder.add_u64_field("freq_coca", numeric.clone());
        let collins = builder.add_u64_field("collins", numeric.clone());
        let oxford = builder.add_u64_field("oxford", numeric);
        let stored_u64 = NumericOptions::default().set_stored();
        let entry_offset = builder.add_u64_field("entry_offset", stored_u64.clone());
        let entry_len = builder.add_u64_field("entry_len", stored_u64);

        let schema = builder.build();
        Self {
            schema,
            id,
            word,
            word_lower,
            source,
            pos,
            tag,
            phonetic,
            definition,
            examples,
            phrases,
            search_text,
            freq_bnc,
            freq_coca,
            collins,
            oxford,
            entry_offset: Some(entry_offset),
            entry_len: Some(entry_len),
            raw_json: None,
        }
    }

    pub fn from_schema(schema: Schema) -> Result<Self> {
        Ok(Self {
            id: field(&schema, "id")?,
            word: field(&schema, "word")?,
            word_lower: field(&schema, "word_lower")?,
            source: field(&schema, "source")?,
            pos: field(&schema, "pos")?,
            tag: field(&schema, "tag")?,
            phonetic: field(&schema, "phonetic")?,
            definition: field(&schema, "definition")?,
            examples: field(&schema, "examples")?,
            phrases: field(&schema, "phrases")?,
            search_text: field(&schema, "search_text")?,
            freq_bnc: field(&schema, "freq_bnc")?,
            freq_coca: field(&schema, "freq_coca")?,
            collins: field(&schema, "collins")?,
            oxford: field(&schema, "oxford")?,
            entry_offset: field_opt(&schema, "entry_offset"),
            entry_len: field_opt(&schema, "entry_len"),
            raw_json: field_opt(&schema, "raw_json"),
            schema,
        })
    }

    pub fn to_document(
        &self,
        entry: &DictEntry,
        locator: Option<EntryLocator>,
    ) -> Result<TantivyDocument> {
        let mut doc = TantivyDocument::default();
        doc.add_text(self.id, &entry.id);
        doc.add_text(self.word, &entry.word);
        doc.add_text(self.word_lower, &entry.word_lower);
        doc.add_text(self.source, &entry.source.slug());

        for pos in &entry.pos {
            doc.add_text(self.pos, pos);
        }
        for tag in &entry.tags {
            doc.add_text(self.tag, tag);
        }

        let phonetic = [entry.phonetic_uk.as_deref(), entry.phonetic_us.as_deref()]
            .into_iter()
            .flatten()
            .collect::<Vec<_>>()
            .join(" ");
        let definition = entry
            .definitions
            .iter()
            .flat_map(|definition| [&definition.zh, &definition.en])
            .cloned()
            .collect::<Vec<_>>()
            .join(" ");
        let examples = entry
            .examples
            .iter()
            .flat_map(|example| [&example.zh, &example.en])
            .cloned()
            .collect::<Vec<_>>()
            .join(" ");
        let phrases = entry
            .phrases
            .iter()
            .flat_map(|phrase| [&phrase.zh, &phrase.en])
            .cloned()
            .collect::<Vec<_>>()
            .join(" ");

        doc.add_text(self.phonetic, &phonetic);
        doc.add_text(self.definition, &definition);
        doc.add_text(self.examples, &examples);
        doc.add_text(self.phrases, &phrases);
        doc.add_text(self.search_text, &expand_for_search(&entry.all_text()));
        doc.add_u64(self.freq_bnc, entry.freq_bnc.unwrap_or(u32::MAX) as u64);
        doc.add_u64(self.freq_coca, entry.freq_coca.unwrap_or(u32::MAX) as u64);
        doc.add_u64(self.collins, entry.collins_star as u64);
        doc.add_u64(self.oxford, u64::from(entry.oxford_3000));

        if let (Some(locator), Some(offset_field), Some(len_field)) =
            (locator, self.entry_offset, self.entry_len)
        {
            doc.add_u64(offset_field, locator.offset);
            doc.add_u64(len_field, locator.len);
        }

        if let Some(raw_json) = self.raw_json {
            doc.add_text(raw_json, &serde_json::to_string(entry)?);
        }

        Ok(doc)
    }
}

/// Text options for a field indexed with the `"default"` tokenizer, recording
/// term frequencies and positions. No stored flag is set on these options.
fn text_with_positions() -> TextOptions {
    TextOptions::default().set_indexing_options(
        TextFieldIndexing::default()
            .set_tokenizer("default")
            .set_index_option(tantivy::schema::IndexRecordOption::WithFreqsAndPositions),
    )
}

/// Looks up the required field `name` in `schema`.
///
/// # Errors
///
/// Returns `DictxError::InvalidData` with a message naming the field when the
/// schema does not contain it; the underlying lookup error is discarded.
fn field(schema: &Schema, name: &str) -> Result<Field> {
    match schema.get_field(name) {
        Ok(found) => Ok(found),
        Err(_) => Err(DictxError::InvalidData(format!("索引缺少字段: {name}"))),
    }
}

/// Looks up the optional field `name` in `schema`, yielding `None` (instead of
/// an error) when the schema does not contain it.
fn field_opt(schema: &Schema, name: &str) -> Option<Field> {
    let lookup = schema.get_field(name);
    lookup.ok()
}