use dictx_core::{DictEntry, DictxError, Result};
use tantivy::schema::{
Field, NumericOptions, Schema, TextFieldIndexing, TextOptions, STORED, STRING, TEXT,
};
use tantivy::TantivyDocument;
use crate::{expand_for_search, EntryLocator};
/// Version number of the index schema described in this file.
///
/// NOTE(review): presumably bumped whenever the field layout changes so that
/// indexes written with an older layout can be detected — confirm against the
/// code that reads this constant.
pub const SCHEMA_VERSION: u32 = 2;
/// The Tantivy [`Schema`] of the dictionary index together with a handle to
/// each of its fields.
///
/// The first fifteen field handles are mandatory: `from_schema` fails when any
/// of them is missing. The last three are optional and are `None` when the
/// schema does not contain them.
#[derive(Debug, Clone)]
pub struct DictxSchema {
/// The schema the field handles below belong to.
pub schema: Schema,
/// Entry identifier (`entry.id`); registered as `STRING | STORED`.
pub id: Field,
/// Headword (`entry.word`); registered as `STRING | STORED`.
pub word: Field,
/// Lower-cased headword (`entry.word_lower`); registered as `STRING | STORED`.
pub word_lower: Field,
/// Slug of the entry's source (`entry.source.slug()`); registered as `STRING | STORED`.
pub source: Field,
/// Part-of-speech values; one value is added per element of `entry.pos`.
pub pos: Field,
/// Tag values; one value is added per element of `entry.tags`.
pub tag: Field,
/// The UK and US phonetics that are present, joined by a space; `TEXT | STORED`.
pub phonetic: Field,
/// `zh` and `en` text of every definition, joined by spaces; `TEXT | STORED`.
pub definition: Field,
/// `zh` and `en` text of every example, joined by spaces; `TEXT | STORED`.
pub examples: Field,
/// `zh` and `en` text of every phrase, joined by spaces; `TEXT | STORED`.
pub phrases: Field,
/// Result of `expand_for_search(entry.all_text())`; registered with indexing
/// options only (see `text_with_positions`), without a stored flag.
pub search_text: Field,
/// `entry.freq_bnc` as `u64`, or `u32::MAX` when the entry has no value;
/// indexed, stored and fast.
pub freq_bnc: Field,
/// `entry.freq_coca` as `u64`, or `u32::MAX` when the entry has no value;
/// indexed, stored and fast.
pub freq_coca: Field,
/// `entry.collins_star` as `u64`; indexed, stored and fast.
pub collins: Field,
/// `entry.oxford_3000` converted to `u64`; indexed, stored and fast.
pub oxford: Field,
/// Stored-only `u64` holding `EntryLocator::offset`. Always present in schemas
/// produced by `build`; `None` when a loaded schema lacks the field.
pub entry_offset: Option<Field>,
/// Stored-only `u64` holding `EntryLocator::len`. Always present in schemas
/// produced by `build`; `None` when a loaded schema lacks the field.
pub entry_len: Option<Field>,
/// Field receiving the JSON-serialized entry. `build` never creates it, so it
/// is only `Some` for schemas loaded through `from_schema`.
/// NOTE(review): presumably kept for indexes written by an older schema
/// version — confirm.
pub raw_json: Option<Field>,
}
impl DictxSchema {
pub fn build() -> Self {
let mut builder = Schema::builder();
let id = builder.add_text_field("id", STRING | STORED);
let word = builder.add_text_field("word", STRING | STORED);
let word_lower = builder.add_text_field("word_lower", STRING | STORED);
let source = builder.add_text_field("source", STRING | STORED);
let pos = builder.add_text_field("pos", STRING | STORED);
let tag = builder.add_text_field("tag", STRING | STORED);
let phonetic = builder.add_text_field("phonetic", TEXT | STORED);
let definition = builder.add_text_field("definition", TEXT | STORED);
let examples = builder.add_text_field("examples", TEXT | STORED);
let phrases = builder.add_text_field("phrases", TEXT | STORED);
let search_text = builder.add_text_field("search_text", text_with_positions());
let numeric = NumericOptions::default()
.set_indexed()
.set_stored()
.set_fast();
let freq_bnc = builder.add_u64_field("freq_bnc", numeric.clone());
let freq_coca = builder.add_u64_field("freq_coca", numeric.clone());
let collins = builder.add_u64_field("collins", numeric.clone());
let oxford = builder.add_u64_field("oxford", numeric);
let stored_u64 = NumericOptions::default().set_stored();
let entry_offset = builder.add_u64_field("entry_offset", stored_u64.clone());
let entry_len = builder.add_u64_field("entry_len", stored_u64);
let schema = builder.build();
Self {
schema,
id,
word,
word_lower,
source,
pos,
tag,
phonetic,
definition,
examples,
phrases,
search_text,
freq_bnc,
freq_coca,
collins,
oxford,
entry_offset: Some(entry_offset),
entry_len: Some(entry_len),
raw_json: None,
}
}
pub fn from_schema(schema: Schema) -> Result<Self> {
Ok(Self {
id: field(&schema, "id")?,
word: field(&schema, "word")?,
word_lower: field(&schema, "word_lower")?,
source: field(&schema, "source")?,
pos: field(&schema, "pos")?,
tag: field(&schema, "tag")?,
phonetic: field(&schema, "phonetic")?,
definition: field(&schema, "definition")?,
examples: field(&schema, "examples")?,
phrases: field(&schema, "phrases")?,
search_text: field(&schema, "search_text")?,
freq_bnc: field(&schema, "freq_bnc")?,
freq_coca: field(&schema, "freq_coca")?,
collins: field(&schema, "collins")?,
oxford: field(&schema, "oxford")?,
entry_offset: field_opt(&schema, "entry_offset"),
entry_len: field_opt(&schema, "entry_len"),
raw_json: field_opt(&schema, "raw_json"),
schema,
})
}
pub fn to_document(
&self,
entry: &DictEntry,
locator: Option<EntryLocator>,
) -> Result<TantivyDocument> {
let mut doc = TantivyDocument::default();
doc.add_text(self.id, &entry.id);
doc.add_text(self.word, &entry.word);
doc.add_text(self.word_lower, &entry.word_lower);
doc.add_text(self.source, &entry.source.slug());
for pos in &entry.pos {
doc.add_text(self.pos, pos);
}
for tag in &entry.tags {
doc.add_text(self.tag, tag);
}
let phonetic = [entry.phonetic_uk.as_deref(), entry.phonetic_us.as_deref()]
.into_iter()
.flatten()
.collect::<Vec<_>>()
.join(" ");
let definition = entry
.definitions
.iter()
.flat_map(|definition| [&definition.zh, &definition.en])
.cloned()
.collect::<Vec<_>>()
.join(" ");
let examples = entry
.examples
.iter()
.flat_map(|example| [&example.zh, &example.en])
.cloned()
.collect::<Vec<_>>()
.join(" ");
let phrases = entry
.phrases
.iter()
.flat_map(|phrase| [&phrase.zh, &phrase.en])
.cloned()
.collect::<Vec<_>>()
.join(" ");
doc.add_text(self.phonetic, &phonetic);
doc.add_text(self.definition, &definition);
doc.add_text(self.examples, &examples);
doc.add_text(self.phrases, &phrases);
doc.add_text(self.search_text, &expand_for_search(&entry.all_text()));
doc.add_u64(self.freq_bnc, entry.freq_bnc.unwrap_or(u32::MAX) as u64);
doc.add_u64(self.freq_coca, entry.freq_coca.unwrap_or(u32::MAX) as u64);
doc.add_u64(self.collins, entry.collins_star as u64);
doc.add_u64(self.oxford, u64::from(entry.oxford_3000));
if let (Some(locator), Some(offset_field), Some(len_field)) =
(locator, self.entry_offset, self.entry_len)
{
doc.add_u64(offset_field, locator.offset);
doc.add_u64(len_field, locator.len);
}
if let Some(raw_json) = self.raw_json {
doc.add_text(raw_json, &serde_json::to_string(entry)?);
}
Ok(doc)
}
}
/// Builds the text options used for the `search_text` field: tokenized with the
/// `"default"` tokenizer and indexed with term frequencies and positions.
///
/// Nothing here marks the field as stored.
fn text_with_positions() -> TextOptions {
    TextOptions::default().set_indexing_options(
        TextFieldIndexing::default()
            .set_tokenizer("default")
            .set_index_option(tantivy::schema::IndexRecordOption::WithFreqsAndPositions),
    )
}
/// Looks up the mandatory field `name` in `schema`.
///
/// # Errors
///
/// Returns `DictxError::InvalidData` carrying the field name when the schema
/// has no such field; the underlying lookup error is discarded.
fn field(schema: &Schema, name: &str) -> Result<Field> {
    match schema.get_field(name) {
        Ok(handle) => Ok(handle),
        Err(_) => Err(DictxError::InvalidData(format!("索引缺少字段: {name}"))),
    }
}
/// Looks up the optional field `name` in `schema`, yielding `None` when the
/// lookup fails.
fn field_opt(schema: &Schema, name: &str) -> Option<Field> {
    let lookup = schema.get_field(name);
    lookup.ok()
}