use std::{collections::HashMap, sync::Arc};
use tantivy::{
Index, IndexReader, IndexWriter, TantivyDocument, Term,
collector::TopDocs,
query::{BooleanQuery, FuzzyTermQuery, Occur, Query, TermQuery},
schema::{Field, IndexRecordOption, OwnedValue, STORED, STRING, Schema, TEXT},
};
use super::{extract_searchable_text, iter_searchable};
use crate::{common::to_value::ToValue, core::item::AnyItem};
/// Full-text search index over the entity types registered as searchable.
///
/// Each indexed entity becomes one tantivy document with three fields:
/// `entity_type`, `entity_id` and `content`. The index is created in RAM
/// (see [`SearchIndex::new`]), so nothing is persisted by this type.
pub struct SearchIndex {
/// Handle to the underlying index. Not read after construction (hence the
/// underscore); presumably retained to keep the index alive — TODO confirm.
_index: Index,
/// Reader built with `ReloadPolicy::Manual`; it is reloaded explicitly in
/// `commit`.
reader: IndexReader,
/// Writer behind a mutex so the `&self` methods can stage and commit changes.
writer: std::sync::Mutex<IndexWriter>,
/// Schema field `entity_type` (`STRING | STORED`), used to filter searches by type.
entity_type_field: Field,
/// Schema field `entity_id` (`STRING | STORED`); the delete key and the search result.
entity_id_field: Field,
/// Schema field `content` (`TEXT`) holding the searchable text.
content_field: Field,
/// Entity type name -> field list handed to `extract_searchable_text`,
/// filled from `iter_searchable()` at construction.
searchable_fields: HashMap<&'static str, &'static [&'static str]>,
}
impl SearchIndex {
pub fn new() -> Self {
let mut schema_builder = Schema::builder();
let entity_type_field = schema_builder.add_text_field("entity_type", STRING | STORED);
let entity_id_field = schema_builder.add_text_field("entity_id", STRING | STORED);
let content_field = schema_builder.add_text_field("content", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let writer = index
.writer(50_000_000)
.expect("failed to create tantivy index writer");
let reader = index
.reader_builder()
.reload_policy(tantivy::ReloadPolicy::Manual)
.try_into()
.expect("failed to create tantivy index reader");
let mut searchable_fields = HashMap::new();
for reg in iter_searchable() {
searchable_fields.insert(reg.entity_type, reg.fields);
}
let registered_count = searchable_fields.len();
log::info!(
"SearchIndex initialized with {} searchable entity types",
registered_count
);
Self {
_index: index,
reader,
writer: std::sync::Mutex::new(writer),
entity_type_field,
entity_id_field,
content_field,
searchable_fields,
}
}
pub fn index_item(&self, item: &Arc<dyn AnyItem>) {
static DISABLED: std::sync::OnceLock<bool> = std::sync::OnceLock::new();
if *DISABLED.get_or_init(|| std::env::var("MYKO_SEARCH_INDEX_DISABLED").is_ok()) {
return;
}
let entity_type = item.entity_type();
let Some(fields) = self.searchable_fields.get(entity_type) else {
return;
};
let id = item.id();
let value = item.to_value();
let text = extract_searchable_text(&value, fields);
if text.is_empty() {
return;
}
self.index_entity(entity_type, &id, &text);
}
fn index_entity(&self, entity_type: &str, entity_id: &str, content: &str) {
let Ok(writer) = self.writer.lock() else {
log::error!("SearchIndex: writer mutex poisoned");
return;
};
let delete_term = tantivy::Term::from_field_text(self.entity_id_field, entity_id);
writer.delete_term(delete_term);
let mut doc = TantivyDocument::new();
doc.add_text(self.entity_type_field, entity_type);
doc.add_text(self.entity_id_field, entity_id);
doc.add_text(self.content_field, content);
writer.add_document(doc).ok();
}
pub fn remove_entity(&self, entity_id: &str) {
let Ok(writer) = self.writer.lock() else {
log::error!("SearchIndex: writer mutex poisoned");
return;
};
let delete_term = tantivy::Term::from_field_text(self.entity_id_field, entity_id);
writer.delete_term(delete_term);
}
pub fn commit(&self) {
let Ok(mut writer) = self.writer.lock() else {
log::error!("SearchIndex: writer mutex poisoned");
return;
};
if let Err(e) = writer.commit() {
log::error!("SearchIndex: commit failed: {}", e);
return;
}
if let Err(e) = self.reader.reload() {
log::error!("SearchIndex: reader reload failed: {}", e);
}
}
pub fn search(&self, entity_type: &str, query: &str, limit: usize) -> Vec<Arc<str>> {
if query.is_empty() || limit == 0 {
return vec![];
}
self.commit();
let searcher = self.reader.searcher();
let type_query: Box<dyn Query> = Box::new(TermQuery::new(
Term::from_field_text(self.entity_type_field, entity_type),
IndexRecordOption::Basic,
));
let mut must_clauses: Vec<(Occur, Box<dyn Query>)> = vec![(Occur::Must, type_query)];
for word in query.split_whitespace() {
let lower = word.to_lowercase();
let term = Term::from_field_text(self.content_field, &lower);
let prefix: Box<dyn Query> =
Box::new(FuzzyTermQuery::new_prefix(term.clone(), 0, true));
if lower.len() >= 4 {
let fuzzy: Box<dyn Query> = Box::new(FuzzyTermQuery::new(term, 1, true));
let either =
BooleanQuery::from(vec![(Occur::Should, prefix), (Occur::Should, fuzzy)]);
must_clauses.push((Occur::Must, Box::new(either)));
} else {
must_clauses.push((Occur::Must, prefix));
}
}
let combined = BooleanQuery::from(must_clauses);
let top_docs = match searcher.search(&combined, &TopDocs::with_limit(limit)) {
Ok(docs) => docs,
Err(e) => {
log::error!("SearchIndex: search error: {}", e);
return vec![];
}
};
top_docs
.into_iter()
.filter_map(|(_score, doc_address)| {
let doc: TantivyDocument = searcher.doc(doc_address).ok()?;
match doc.get_first(self.entity_id_field)? {
OwnedValue::Str(s) => Some(Arc::<str>::from(s.as_str())),
_ => None,
}
})
.collect()
}
pub fn is_searchable(&self, entity_type: &str) -> bool {
self.searchable_fields.contains_key(entity_type)
}
pub fn build_from_registry(&self, registry: &crate::store::StoreRegistry) {
use hyphae::Gettable;
let mut count = 0;
for (entity_type, fields) in &self.searchable_fields {
let Some(store) = registry.get(entity_type) else {
continue;
};
let entries = store.entries().get();
for (id, item) in entries.iter() {
let value = item.to_value();
let text = extract_searchable_text(&value, fields);
if !text.is_empty() {
self.index_entity(entity_type, id, &text);
count += 1;
}
}
}
self.commit();
log::info!("SearchIndex: built initial index with {} entities", count);
}
}
impl Default for SearchIndex {
fn default() -> Self {
Self::new()
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a fresh index containing `docs` (entity type, id, content) and commits it.
    fn committed_index(docs: &[(&str, &str, &str)]) -> SearchIndex {
        let index = SearchIndex::new();
        stage_and_commit(&index, docs);
        index
    }

    /// Stages `docs` on an existing index and commits them.
    fn stage_and_commit(index: &SearchIndex, docs: &[(&str, &str, &str)]) {
        for &(entity_type, id, content) in docs {
            index.index_entity(entity_type, id, content);
        }
        index.commit();
    }

    /// Asserts that searching `Target` for `query` yields `expected` hits.
    fn assert_target_hits(index: &SearchIndex, query: &str, expected: usize, why: &str) {
        let hits = index.search("Target", query, 10);
        assert_eq!(hits.len(), expected, "{}, got: {:?}", why, hits);
    }

    /// A brand-new index has nothing to return.
    #[test]
    fn test_search_index_creation() {
        let index = SearchIndex::new();
        let hits = index.search("NonExistent", "test", 10);
        assert!(hits.is_empty());
    }

    /// Searches are scoped to the requested entity type.
    #[test]
    fn test_index_and_search() {
        let index = committed_index(&[
            ("TestEntity", "id-1", "hello world search test"),
            ("TestEntity", "id-2", "another document here"),
            ("OtherEntity", "id-3", "hello from other type"),
        ]);

        for (entity_type, expected_id) in [("TestEntity", "id-1"), ("OtherEntity", "id-3")] {
            let hits = index.search(entity_type, "hello", 10);
            assert_eq!(hits.len(), 1);
            assert_eq!(hits[0].as_ref(), expected_id);
        }
    }

    /// Re-indexing an id replaces its previous content.
    #[test]
    fn test_index_upsert() {
        let index = committed_index(&[("TestEntity", "id-1", "original text")]);
        stage_and_commit(&index, &[("TestEntity", "id-1", "updated text")]);

        let hits = index.search("TestEntity", "updated", 10);
        assert_eq!(hits.len(), 1);
        assert_eq!(hits[0].as_ref(), "id-1");

        let stale = index.search("TestEntity", "original", 10);
        assert!(stale.is_empty());
    }

    /// A removed entity no longer shows up in results.
    #[test]
    fn test_remove_entity() {
        let index = committed_index(&[("TestEntity", "id-1", "hello world")]);
        index.remove_entity("id-1");
        index.commit();

        let hits = index.search("TestEntity", "hello", 10);
        assert!(hits.is_empty());
    }

    /// An empty query short-circuits to no results.
    #[test]
    fn test_empty_query() {
        let index = SearchIndex::new();
        let hits = index.search("TestEntity", "", 10);
        assert!(hits.is_empty());
    }

    /// A zero limit short-circuits to no results even when a match exists.
    #[test]
    fn test_zero_limit() {
        let index = committed_index(&[("TestEntity", "id-1", "hello world")]);
        let hits = index.search("TestEntity", "hello", 0);
        assert!(hits.is_empty());
    }

    /// Covers prefix matching, fuzzy matching and multi-word queries.
    #[test]
    fn test_prefix_search() {
        let index = committed_index(&[
            ("Target", "id-1", "base1"),
            ("Target", "id-2", "base2"),
            ("Target", "id-3", "camera"),
        ]);

        let single_word_cases = [
            ("ba", 2, "prefix 'ba' should match base1 and base2"),
            ("base", 2, "prefix 'base' should match base1 and base2"),
            ("cam", 1, "prefix 'cam' should match camera"),
            ("camra", 1, "fuzzy 'camra' should match camera"),
            ("ba", 2, "'ba' should only prefix-match base1/base2"),
        ];
        for (query, expected, why) in single_word_cases {
            assert_target_hits(&index, query, expected, why);
        }

        stage_and_commit(&index, &[("Target", "id-4", "light panel front")]);
        assert_target_hits(&index, "light front", 1, "multi-word should match id-4");
    }
}