use dictx_core::{
clean_pos, normalize_tag, DictEntry, DictSource, DictxError, Query, Result, SearchFilters,
SearchRequest,
};
use dictx_index::{
expand_for_search, open_index, tantivy_error, DictxSchema, EntryLocator, EntryPackReader,
ENTRY_PACK_FILE,
};
use std::cmp::Ordering;
use std::collections::BTreeSet;
use std::path::Path;
use std::sync::Arc;
use std::time::Instant;
use tantivy::collector::TopDocs;
use tantivy::query::{
AllQuery, BooleanQuery, BoostQuery, FuzzyTermQuery, Query as TantivyQuery, QueryParser,
RegexQuery, TermQuery,
};
use tantivy::schema::IndexRecordOption;
use tantivy::schema::Value;
use tantivy::{Index, IndexReader, ReloadPolicy, TantivyDocument, Term};
/// A dictionary entry paired with its final relevance score.
#[derive(Debug, Clone)]
pub struct ScoredEntry {
    /// The matched dictionary entry.
    pub entry: DictEntry,
    /// Tantivy score plus heuristic adjustments (see `adjust_score`).
    pub score: f32,
}
/// Paginated outcome of a single search request.
#[derive(Debug, Clone)]
pub struct SearchResult {
    /// The requested page of scored entries, best first.
    pub entries: Vec<ScoredEntry>,
    /// Number of filtered candidates before pagination was applied.
    pub total: usize,
    /// Wall-clock time the search took, in milliseconds.
    pub elapsed_ms: u128,
}
/// Read-only search handle over a built dictionary index.
pub struct DictSearcher {
    // Underlying tantivy index (also needed for query parsing).
    index: Index,
    // Long-lived reader; configured to reload after commits with a delay.
    reader: IndexReader,
    // Resolved field handles of the dictionary schema.
    fields: DictxSchema,
    // Optional binary entry pack for fast entry loading; absent on older indexes.
    pack: Option<Arc<EntryPackReader>>,
}
impl DictSearcher {
pub fn open(index_dir: &Path) -> Result<Self> {
let index = open_index(index_dir)?;
let fields = DictxSchema::from_schema(index.schema())?;
let pack_path = index_dir.join(ENTRY_PACK_FILE);
let pack = if pack_path.exists() {
Some(Arc::new(EntryPackReader::open(&pack_path)?))
} else {
None
};
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::OnCommitWithDelay)
.try_into()
.map_err(tantivy_error)?;
Ok(Self {
index,
reader,
fields,
pack,
})
}
pub fn search(&self, request: &SearchRequest) -> Result<SearchResult> {
let start = Instant::now();
let query = self.build_query(&request.query)?;
let searcher = self.reader.searcher();
let fetch_limit = request
.limit
.saturating_add(request.offset)
.max(100)
.min(5_000);
let top_docs = searcher
.search(&query, &TopDocs::with_limit(fetch_limit))
.map_err(tantivy_error)?;
let mut entries = Vec::new();
for (score, address) in top_docs {
let doc: TantivyDocument = searcher.doc(address).map_err(tantivy_error)?;
let entry = self.entry_from_doc(&doc)?;
if filters_match(&entry, &request.filters) {
let adjusted = adjust_score(score, &entry, &request.query);
entries.push(ScoredEntry {
entry,
score: adjusted,
});
}
}
entries.sort_by(|left, right| {
right
.score
.partial_cmp(&left.score)
.unwrap_or(Ordering::Equal)
.then_with(|| left.entry.word_lower.cmp(&right.entry.word_lower))
});
let total = entries.len();
let entries = entries
.into_iter()
.skip(request.offset)
.take(request.limit)
.collect();
Ok(SearchResult {
entries,
total,
elapsed_ms: start.elapsed().as_millis(),
})
}
fn build_query(&self, query: &Query) -> Result<Box<dyn TantivyQuery>> {
match query {
Query::Exact { word } => {
let term = Term::from_field_text(self.fields.word_lower, &word.to_lowercase());
Ok(Box::new(TermQuery::new(term, IndexRecordOption::Basic)))
}
Query::Fuzzy { word, distance } => {
let word = word.to_lowercase();
let distance = (*distance).min(2);
let term = Term::from_field_text(self.fields.word_lower, &word);
let mut queries: Vec<Box<dyn TantivyQuery>> =
vec![Box::new(FuzzyTermQuery::new(term, distance, true))];
for candidate in fuzzy_exact_candidates(&word, distance) {
let term = Term::from_field_text(self.fields.word_lower, &candidate);
let query = TermQuery::new(term, IndexRecordOption::Basic);
queries.push(Box::new(BoostQuery::new(Box::new(query), 8.0)));
}
Ok(Box::new(BooleanQuery::union(queries)))
}
Query::FullText { text } => self.full_text_query(text),
Query::Chinese { text } => self.full_text_query(&expand_for_search(text)),
Query::Wildcard { pattern } => {
let regex = wildcard_to_regex(pattern);
RegexQuery::from_pattern(®ex, self.fields.word_lower)
.map(|query| Box::new(query) as Box<dyn TantivyQuery>)
.map_err(|err| DictxError::Tantivy(err.to_string()))
}
}
}
fn full_text_query(&self, text: &str) -> Result<Box<dyn TantivyQuery>> {
if text.trim().is_empty() {
return Ok(Box::new(AllQuery));
}
let parser = QueryParser::for_index(
&self.index,
vec![
self.fields.word,
self.fields.definition,
self.fields.examples,
self.fields.phrases,
self.fields.search_text,
],
);
parser
.parse_query(text)
.or_else(|_| parser.parse_query(&escape_query(text)))
.map_err(|err| DictxError::Tantivy(err.to_string()))
}
fn entry_from_doc(&self, doc: &TantivyDocument) -> Result<DictEntry> {
if let (Some(pack), Some(offset_field), Some(len_field)) =
(&self.pack, self.fields.entry_offset, self.fields.entry_len)
{
if let (Some(offset), Some(len)) = (
doc.get_first(offset_field).and_then(|value| value.as_u64()),
doc.get_first(len_field).and_then(|value| value.as_u64()),
) {
return pack.read(EntryLocator { offset, len });
}
}
if let Some(raw_json) = self.fields.raw_json {
let raw = doc
.get_first(raw_json)
.and_then(|value| value.as_str())
.ok_or_else(|| DictxError::InvalidData("索引文档缺少 raw_json".to_string()))?;
return Ok(serde_json::from_str(raw)?);
}
Err(DictxError::InvalidData(
"索引缺少二进制词条 pack,请运行 dictx build --force 重建索引".to_string(),
))
}
}
/// Returns true when `entry` satisfies every active filter in `filters`.
fn filters_match(entry: &DictEntry, filters: &SearchFilters) -> bool {
    // Source may match either the slug (trimmed) or the display name.
    let source_ok = filters.source.as_ref().map_or(true, |source| {
        entry.source.slug() == source.trim() || entry.source.display_name() == *source
    });
    if !source_ok {
        return false;
    }
    if let Some(pos) = &filters.pos {
        let wanted = clean_pos(pos);
        let in_entry_pos = entry.pos.iter().any(|value| clean_pos(value) == wanted);
        let in_definitions = entry
            .definitions
            .iter()
            .filter_map(|definition| definition.pos.as_deref())
            .any(|value| clean_pos(value) == wanted);
        if !in_entry_pos && !in_definitions {
            return false;
        }
    }
    if let Some(tag) = &filters.tag {
        let wanted = normalize_tag(tag);
        if !entry.tags.iter().any(|value| normalize_tag(value) == wanted) {
            return false;
        }
    }
    if matches!(filters.collins_min, Some(min) if entry.collins_star < min) {
        return false;
    }
    // Entries without a BNC frequency pass the min filter but fail the max one.
    let freq = entry.freq_bnc.unwrap_or(u32::MAX);
    if matches!(filters.freq_min, Some(min) if freq < min) {
        return false;
    }
    if matches!(filters.freq_max, Some(max) if freq > max) {
        return false;
    }
    if filters.oxford_only && !entry.oxford_3000 {
        return false;
    }
    true
}
/// Applies heuristic boosts on top of the raw tantivy score.
fn adjust_score(score: f32, entry: &DictEntry, query: &Query) -> f32 {
    let mut adjusted = score;
    let user_text = query.user_text().to_lowercase();
    // Exact headword matches dominate; prefix matches get a smaller boost.
    if entry.word_lower == user_text {
        adjusted += 100.0;
    } else if entry.word_lower.starts_with(&user_text) {
        adjusted += 20.0;
    }
    match query {
        Query::Fuzzy { word, .. } => {
            let needle = word.to_lowercase();
            // Reward closeness to the typed word and shared prefixes.
            let distance = damerau_levenshtein(&entry.word_lower, &needle) as f32;
            adjusted += (3.0 - distance).max(0.0) * 25.0;
            let shared = common_prefix_len(&entry.word_lower, &needle).min(5) as f32;
            adjusted += shared * 3.0;
            if entry.word_lower.chars().next() == needle.chars().next() {
                adjusted += 5.0;
            }
        }
        Query::Chinese { text } => adjusted += chinese_relevance_score(entry, text),
        _ => {}
    }
    // Small boosts for well-attested words.
    if entry.collins_star > 0 {
        adjusted += entry.collins_star as f32 * 0.2;
    }
    match entry.freq_bnc {
        Some(freq) if freq <= 1_000 => adjusted += 1.0,
        Some(freq) if freq <= 5_000 => adjusted += 0.5,
        _ => {}
    }
    adjusted
}
/// Heuristic relevance score of `entry` for a Chinese query string.
///
/// Direct hits (definitions, phrases, synonyms, related words) add large
/// boosts and unlock the source-quality bonus; example-only hits are
/// penalized so they rank below genuine definition matches. Likely proper
/// names and plural inflections are demoted.
fn chinese_relevance_score(entry: &DictEntry, text: &str) -> f32 {
    let query = text.trim();
    if query.is_empty() {
        return 0.0;
    }
    let mut score = 0.0;
    // Hits in definitions/phrases/synonyms/related words vs. examples only.
    let mut direct_hits = 0usize;
    let mut example_hits = 0usize;
    for (idx, definition) in entry.definitions.iter().enumerate() {
        if contains_query(&definition.zh, query) {
            direct_hits += 1;
            // Earlier definitions are more canonical: 96 at idx 0, minus 8 per
            // position, flattening out after the sixth definition.
            score += 96.0 - idx.min(6) as f32 * 8.0;
            if idx == 0 {
                score += 18.0;
            }
            // A short Chinese gloss is likely a direct translation of the query.
            let zh_width = definition.zh.chars().count();
            if zh_width <= query.chars().count() + 8 {
                score += 18.0;
            }
            // The query appearing at the start of the gloss is a stronger signal.
            if starts_with_semantic_marker(&definition.zh, query) {
                score += 12.0;
            }
        }
        if contains_query(&definition.en, query) {
            direct_hits += 1;
            score += 20.0;
        }
    }
    for phrase in &entry.phrases {
        if contains_query(&phrase.zh, query) || contains_query(&phrase.en, query) {
            direct_hits += 1;
            score += 42.0;
        }
    }
    for synonym in &entry.synonyms {
        if contains_query(&synonym.zh_meaning, query) {
            direct_hits += 1;
            score += 38.0;
        }
    }
    for related in &entry.related_words {
        for word in &related.words {
            if contains_query(&word.translation, query) {
                direct_hits += 1;
                score += 30.0;
            }
        }
    }
    // Example-sentence hits are weak evidence of relevance.
    for example in &entry.examples {
        if contains_query(&example.zh, query) || contains_query(&example.en, query) {
            example_hits += 1;
            score += 5.0;
        }
    }
    if direct_hits > 0 {
        // Reward curated/exam-oriented sources only on a real content hit.
        score += quality_score(entry);
    } else if example_hits > 0 {
        // Demote entries that only matched inside example sentences.
        score -= 34.0;
    }
    if is_likely_proper_name(entry) {
        score -= if direct_hits > 0 { 18.0 } else { 34.0 };
    }
    if is_inflected_plural(entry) {
        score -= 12.0;
    }
    score
}
/// True when `text` is non-blank and contains `query` as a substring.
fn contains_query(text: &str, query: &str) -> bool {
    if text.trim().is_empty() {
        return false;
    }
    text.contains(query)
}
/// True when `query` appears at the start of `text` once leading punctuation
/// (ASCII and common full-width separators) and whitespace are stripped.
fn starts_with_semantic_marker(text: &str, query: &str) -> bool {
    let stripped = text
        .trim_start_matches(|ch: char| ch.is_ascii_punctuation() || ch.is_whitespace())
        .trim_start_matches(|ch: char| matches!(ch, ',' | '。' | ';' | '、' | ':'));
    if stripped.starts_with(query) {
        return true;
    }
    // Also accept the query directly followed by a full-width separator.
    let with_comma = format!("{query},");
    let with_semicolon = format!("{query};");
    stripped.starts_with(&with_comma) || stripped.starts_with(&with_semicolon)
}
/// Bonus score for entries coming from curated, exam-oriented sources.
fn quality_score(entry: &DictEntry) -> f32 {
    // Exam tags: domestic exams weigh more than international ones.
    let tag_bonus: f32 = entry
        .tags
        .iter()
        .map(|tag| match normalize_tag(tag).as_str() {
            "zk" | "gk" | "cet4" | "cet6" | "tem4" | "tem8" | "kao_yan" => 15.0,
            "toefl" | "ielts" | "gre" | "gmat" | "sat" => 8.0,
            _ => 0.0,
        })
        .sum();
    let mut score = tag_bonus;
    // Trusted sources get a flat bonus.
    score += match &entry.source {
        DictSource::Anki { deck_name } if deck_name.eq_ignore_ascii_case("KaoYan_3") => 18.0,
        DictSource::Ecdict => 14.0,
        DictSource::Sqlite { name, .. } if name == "kd_data" => 4.0,
        _ => 0.0,
    };
    if entry.collins_star > 0 {
        score += entry.collins_star as f32 * 4.0;
    }
    if entry.oxford_3000 {
        score += 14.0;
    }
    // Common words (by BNC frequency rank) are more likely intended.
    score += match entry.freq_bnc {
        Some(freq) if freq <= 1_000 => 12.0,
        Some(freq) if freq <= 5_000 => 7.0,
        Some(freq) if freq <= 10_000 => 3.0,
        _ => 0.0,
    };
    score
}
/// Guesses whether an entry is a proper name (capitalized headword or a
/// Chinese gloss that explicitly marks it as a person/place name).
fn is_likely_proper_name(entry: &DictEntry) -> bool {
    if entry
        .word
        .chars()
        .next()
        .is_some_and(|ch| ch.is_ascii_uppercase())
    {
        return true;
    }
    const NAME_MARKERS: [&str; 4] = ["人名", "地名", "省名", "男子名"];
    entry.definitions.iter().any(|definition| {
        NAME_MARKERS
            .iter()
            .any(|marker| definition.zh.contains(marker))
    })
}
/// Guesses whether an entry is merely the plural form of another word.
fn is_inflected_plural(entry: &DictEntry) -> bool {
    let word = &entry.word_lower;
    // Short words ending in 's' (e.g. "bus") are not treated as plurals.
    if word.len() <= 3 || !word.ends_with('s') {
        return false;
    }
    entry
        .definitions
        .iter()
        .any(|definition| definition.zh.contains("复数") || definition.en.contains("plural"))
}
/// Optimal-string-alignment distance: Levenshtein edits plus swaps of
/// adjacent characters, computed over Unicode scalar values.
fn damerau_levenshtein(left: &str, right: &str) -> usize {
    if left == right {
        return 0;
    }
    let a: Vec<char> = left.chars().collect();
    let b: Vec<char> = right.chars().collect();
    if a.is_empty() {
        return b.len();
    }
    if b.is_empty() {
        return a.len();
    }
    // dist[i * cols + j] = distance between a[..i] and b[..j], row-major.
    let cols = b.len() + 1;
    let mut dist = vec![0usize; (a.len() + 1) * cols];
    for i in 0..=a.len() {
        dist[i * cols] = i;
    }
    for j in 0..=b.len() {
        dist[j] = j;
    }
    for i in 1..=a.len() {
        for j in 1..=b.len() {
            let substitution_cost = usize::from(a[i - 1] != b[j - 1]);
            let mut best = (dist[(i - 1) * cols + j] + 1)
                .min(dist[i * cols + j - 1] + 1)
                .min(dist[(i - 1) * cols + j - 1] + substitution_cost);
            // Adjacent transposition counts as a single edit.
            if i > 1 && j > 1 && a[i - 1] == b[j - 2] && a[i - 2] == b[j - 1] {
                best = best.min(dist[(i - 2) * cols + j - 2] + 1);
            }
            dist[i * cols + j] = best;
        }
    }
    dist[a.len() * cols + b.len()]
}
/// Length (in chars) of the shared prefix of two strings.
fn common_prefix_len(left: &str, right: &str) -> usize {
    let mut count = 0;
    let mut rhs = right.chars();
    for ch in left.chars() {
        if rhs.next() != Some(ch) {
            break;
        }
        count += 1;
    }
    count
}
/// One-edit neighbors of `word`: every single-character deletion and every
/// adjacent transposition, deduplicated and returned in sorted order.
/// Returns nothing for distance 0 or words outside the 2..=32 char range.
fn fuzzy_exact_candidates(word: &str, distance: u8) -> Vec<String> {
    if distance == 0 {
        return Vec::new();
    }
    let chars: Vec<char> = word.chars().collect();
    if chars.len() < 2 || chars.len() > 32 {
        return Vec::new();
    }
    let mut candidates = BTreeSet::new();
    // Single-character deletions.
    for skip in 0..chars.len() {
        let candidate: String = chars
            .iter()
            .enumerate()
            .filter(|(char_idx, _)| *char_idx != skip)
            .map(|(_, ch)| *ch)
            .collect();
        if candidate != word {
            candidates.insert(candidate);
        }
    }
    // Adjacent transpositions.
    for idx in 1..chars.len() {
        let mut swapped = chars.clone();
        swapped.swap(idx - 1, idx);
        let candidate: String = swapped.into_iter().collect();
        if candidate != word {
            candidates.insert(candidate);
        }
    }
    candidates.into_iter().collect()
}
/// Plain Levenshtein distance using a single rolling row of costs.
#[allow(dead_code)]
fn levenshtein(left: &str, right: &str) -> usize {
    let target: Vec<char> = right.chars().collect();
    let mut row: Vec<usize> = (0..=target.len()).collect();
    for (i, left_ch) in left.chars().enumerate() {
        // `diagonal` holds the value from the previous row, one column back.
        let mut diagonal = row[0];
        row[0] = i + 1;
        for (j, right_ch) in target.iter().enumerate() {
            let above = row[j + 1];
            let replace = diagonal + usize::from(left_ch != *right_ch);
            row[j + 1] = replace.min(row[j] + 1).min(above + 1);
            diagonal = above;
        }
    }
    row[target.len()]
}
/// Converts a glob-style pattern (`*` = any run of characters, `?` = any
/// single character) into an anchored regular expression suitable for
/// tantivy's `RegexQuery`.
///
/// BUGFIX: the escape call contained the mojibake `®ex_escape_char` (a
/// corrupted `&regex_escape_char`), which does not compile; the escaping is
/// now done inline.
fn wildcard_to_regex(pattern: &str) -> String {
    let mut out = String::from("^");
    for ch in pattern.chars() {
        match ch {
            '*' => out.push_str(".*"),
            '?' => out.push('.'),
            // Escape regex metacharacters so they match literally.
            '.' | '+' | '(' | ')' | '|' | '^' | '$' | '[' | ']' | '{' | '}' | '\\' => {
                out.push('\\');
                out.push(ch);
            }
            _ => out.push(ch),
        }
    }
    out.push('$');
    out
}
/// Escapes a single character when it is a regex metacharacter; otherwise
/// returns it unchanged as a string.
fn regex_escape_char(ch: char) -> String {
    const SPECIAL: &[char] = &[
        '.', '+', '(', ')', '|', '^', '$', '[', ']', '{', '}', '\\',
    ];
    if SPECIAL.contains(&ch) {
        format!("\\{ch}")
    } else {
        ch.to_string()
    }
}
/// Replaces tantivy query-syntax metacharacters with spaces so a raw user
/// string can be re-parsed as plain terms.
fn escape_query(text: &str) -> String {
    let mut out = String::with_capacity(text.len());
    for ch in text.chars() {
        match ch {
            ':' | '"' | '[' | ']' | '{' | '}' | '(' | ')' | '^' | '~' | '*' => out.push(' '),
            _ => out.push(ch),
        }
    }
    out
}
#[cfg(test)]
mod tests {
    use super::*;
    use dictx_core::{Definition, DictSource, Example};
    use dictx_index::{build_index, BuildOptions};

    // Builds a one-entry index on disk and checks that a Chinese full-text
    // query for the gloss finds the entry.
    #[test]
    fn searches_built_index() {
        let dir = tempfile::tempdir().unwrap();
        let mut entry = DictEntry::new(
            DictSource::Custom {
                name: "test".into(),
            },
            "apple",
        );
        entry
            .definitions
            .push(Definition::new("round fruit", "苹果", Some("n".into())));
        build_index(
            dir.path(),
            vec![Ok(entry)],
            &BuildOptions {
                force: true,
                ..BuildOptions::default()
            },
        )
        .unwrap();
        let searcher = DictSearcher::open(dir.path()).unwrap();
        let result = searcher
            .search(&SearchRequest::new(Query::Chinese {
                text: "苹果".to_string(),
            }))
            .unwrap();
        assert_eq!(result.entries[0].entry.word, "apple");
    }

    // A fuzzy query for "buss" at distance 1 should rank the one-edit exact
    // term "bus" first, ahead of same-distance fuzzy matches like "cuss"/"Russ".
    #[test]
    fn fuzzy_search_promotes_one_step_exact_candidates() {
        let dir = tempfile::tempdir().unwrap();
        let entries = ["cuss", "Russ", "bus"].into_iter().map(|word| {
            let mut entry = DictEntry::new(
                DictSource::Custom {
                    name: "test".into(),
                },
                word,
            );
            entry
                .definitions
                .push(Definition::new("test entry", "", Some("n".into())));
            Ok(entry)
        });
        build_index(
            dir.path(),
            entries,
            &BuildOptions {
                force: true,
                ..BuildOptions::default()
            },
        )
        .unwrap();
        let searcher = DictSearcher::open(dir.path()).unwrap();
        let result = searcher
            .search(&SearchRequest::new(Query::Fuzzy {
                word: "buss".to_string(),
                distance: 1,
            }))
            .unwrap();
        assert_eq!(result.entries[0].entry.word_lower, "bus");
    }

    // An entry whose definition contains the query ("伙计" in guy's gloss)
    // must outscore an entry that only matches inside an example sentence.
    #[test]
    fn chinese_definition_hits_beat_example_only_hits() {
        let mut guy = DictEntry::new(
            DictSource::Anki {
                deck_name: "KaoYan_3".into(),
            },
            "guy",
        );
        guy.tags.push("kao_yan".into());
        guy.definitions
            .push(Definition::new("a man", "家伙,伙计", Some("n".into())));
        let mut place = DictEntry::new(
            DictSource::Sqlite {
                name: "kd_data".into(),
                table: "en".into(),
            },
            "Leinster",
        );
        place.definitions.push(Definition::new(
            "",
            "伦斯特省(爱尔兰省名)",
            Some("n".into()),
        ));
        place.examples.push(Example {
            en: "The collector and his fellow laughed.".into(),
            zh: "收费员和他的一个伙计狠狠嘲笑了我一番。".into(),
        });
        let query = Query::Chinese {
            text: "伙计".into(),
        };
        assert!(
            chinese_relevance_score(&guy, query.user_text())
                > chinese_relevance_score(&place, query.user_text())
        );
    }
}