use parking_lot::Mutex;
use std::fmt;
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::{Field, Schema, SchemaBuilder, STORED, STRING, TEXT};
use tantivy::{Index, IndexReader, IndexWriter, ReloadPolicy, TantivyDocument};
/// Namespace IRI of the Apache Jena text-search vocabulary.
///
/// NOTE(review): not referenced by the code visible in this file; presumably
/// used by callers to recognise `text:` properties — confirm against usage.
pub const TEXT_NAMESPACE: &str = "http://jena.apache.org/text#";
/// Error type returned by the text-search index.
///
/// Every variant carries a human-readable message; the originating error
/// value itself is not retained.
#[derive(Debug)]
pub enum TextSearchError {
    /// The underlying index reported a failure.
    IndexError(String),
    /// The query string could not be turned into a query.
    QueryError(String),
    /// Schema-related failure (not constructed by the code visible here).
    SchemaError(String),
}

impl fmt::Display for TextSearchError {
    /// Renders the error as `text <kind> error: <message>`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Pick the variant-specific label first so the output is formatted in one place.
        let (label, detail) = match self {
            Self::IndexError(detail) => ("text index error", detail),
            Self::QueryError(detail) => ("text query error", detail),
            Self::SchemaError(detail) => ("text schema error", detail),
        };
        write!(f, "{label}: {detail}")
    }
}

impl std::error::Error for TextSearchError {}
impl From<tantivy::TantivyError> for TextSearchError {
fn from(e: tantivy::TantivyError) -> Self {
Self::IndexError(e.to_string())
}
}
impl From<tantivy::query::QueryParserError> for TextSearchError {
fn from(e: tantivy::query::QueryParserError) -> Self {
Self::QueryError(e.to_string())
}
}
/// One hit returned by a text search: the stored fields of the matching
/// document together with its relevance score.
#[derive(Debug, Clone)]
pub struct TextSearchResult {
/// Stored subject value of the matching document (empty if absent).
pub subject_iri: String,
/// Relevance score assigned by the search; results are collected ordered by score.
pub score: f32,
/// Stored literal text of the matching document (empty if absent).
pub literal_value: String,
/// Stored predicate value of the matching document (empty if absent).
pub predicate_iri: String,
}
// Names of the schema fields each indexed triple is stored under.
/// Schema field name for the triple's subject.
const FIELD_SUBJECT: &str = "subject";
/// Schema field name for the triple's literal text (the field queries run against).
const FIELD_LITERAL: &str = "literal";
/// Schema field name for the triple's predicate.
const FIELD_PREDICATE: &str = "predicate";
/// Full-text index over (subject, predicate, literal) triples backed by tantivy.
///
/// Writes go through a mutex-guarded writer so indexing works through `&self`;
/// searches go through a reader that is reloaded on commit.
pub struct TextSearchIndex {
/// Underlying tantivy index; also used to build query parsers.
index: Index,
/// Reader from which searchers are obtained.
reader: IndexReader,
/// Single index writer, locked for each add/commit.
writer: Mutex<IndexWriter>,
/// Handle of the `subject` schema field.
subject_field: Field,
/// Handle of the `literal` schema field.
literal_field: Field,
/// Handle of the `predicate` schema field.
predicate_field: Field,
}
impl TextSearchIndex {
const WRITER_HEAP_BYTES: usize = 50_000_000;
pub fn new_in_memory() -> Result<Self, TextSearchError> {
let (schema, subject_field, literal_field, predicate_field) = Self::build_schema();
let index = Index::create_in_ram(schema);
let writer: IndexWriter = index
.writer(Self::WRITER_HEAP_BYTES)
.map_err(TextSearchError::from)?;
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.map_err(TextSearchError::from)?;
Ok(Self {
index,
reader,
writer: Mutex::new(writer),
subject_field,
literal_field,
predicate_field,
})
}
pub fn index_triple(
&self,
subject: &str,
predicate: &str,
literal: &str,
) -> Result<(), TextSearchError> {
let mut doc = TantivyDocument::default();
doc.add_text(self.subject_field, subject);
doc.add_text(self.predicate_field, predicate);
doc.add_text(self.literal_field, literal);
self.writer
.lock()
.add_document(doc)
.map_err(TextSearchError::from)?;
Ok(())
}
pub fn commit(&self) -> Result<(), TextSearchError> {
self.writer
.lock()
.commit()
.map(|_| ())
.map_err(TextSearchError::from)?;
self.reader.reload().map_err(TextSearchError::from)
}
pub fn search(
&self,
query_str: &str,
max_results: usize,
) -> Result<Vec<TextSearchResult>, TextSearchError> {
self.run_search(query_str, None, max_results)
}
pub fn search_predicate(
&self,
query_str: &str,
predicate: &str,
max_results: usize,
) -> Result<Vec<TextSearchResult>, TextSearchError> {
self.run_search(query_str, Some(predicate), max_results)
}
pub fn num_docs(&self) -> u64 {
self.reader.searcher().num_docs()
}
fn build_schema() -> (Schema, Field, Field, Field) {
let mut builder: SchemaBuilder = Schema::builder();
let subject_field = builder.add_text_field(FIELD_SUBJECT, STRING | STORED);
let predicate_field = builder.add_text_field(FIELD_PREDICATE, STRING | STORED);
let literal_field = builder.add_text_field(FIELD_LITERAL, TEXT | STORED);
let schema = builder.build();
(schema, subject_field, literal_field, predicate_field)
}
fn run_search(
&self,
query_str: &str,
predicate_filter: Option<&str>,
max_results: usize,
) -> Result<Vec<TextSearchResult>, TextSearchError> {
if query_str.is_empty() {
return Ok(Vec::new());
}
let searcher = self.reader.searcher();
let query_parser = QueryParser::for_index(&self.index, vec![self.literal_field]);
let query = query_parser
.parse_query(query_str)
.map_err(TextSearchError::from)?;
let fetch_limit = if predicate_filter.is_some() {
max_results.saturating_mul(10).max(max_results + 50)
} else {
max_results
};
let top_docs = searcher
.search(&query, &TopDocs::with_limit(fetch_limit).order_by_score())
.map_err(TextSearchError::from)?;
let mut results = Vec::with_capacity(top_docs.len());
for (score, doc_address) in top_docs {
let doc: TantivyDocument = searcher.doc(doc_address).map_err(TextSearchError::from)?;
let subject_iri = self.get_stored_str(&doc, self.subject_field);
let literal_value = self.get_stored_str(&doc, self.literal_field);
let predicate_iri = self.get_stored_str(&doc, self.predicate_field);
if let Some(pred) = predicate_filter {
if predicate_iri != pred {
continue;
}
}
results.push(TextSearchResult {
subject_iri,
score,
literal_value,
predicate_iri,
});
if results.len() >= max_results {
break;
}
}
Ok(results)
}
fn get_stored_str(&self, doc: &TantivyDocument, field: Field) -> String {
use tantivy::schema::Value;
doc.get_first(field)
.and_then(|v| v.as_str())
.unwrap_or("")
.to_string()
}
}