use alloc::collections::BTreeMap;
use alloc::string::{String, ToString};
use alloc::vec::Vec;
use super::index::FtsEngine;
use super::ranking::{Bm25Params, bm25_score};
use super::tokenizer::{fuzzy_match, levenshtein_distance};
use super::types::{IndexError, InvertedIndex, SearchHit, SearchOptions};
impl FtsEngine {
pub fn search(
&self,
dataset: &str,
query: &str,
options: &SearchOptions,
) -> Result<Vec<SearchHit>, IndexError> {
let index = self
.index(dataset)
.ok_or_else(|| IndexError::DatasetNotFound(dataset.to_string()))?;
let query_terms = self.tokenizer().tokenize_query(query);
if query_terms.is_empty() {
return Ok(Vec::new());
}
let search_terms = if options.fuzzy {
expand_fuzzy_terms(&query_terms, index, options.fuzzy_distance)
} else {
query_terms.clone()
};
let params = Bm25Params::default();
let mut doc_scores: BTreeMap<u64, DocumentScore> = BTreeMap::new();
for term in &search_terms {
if let Some(posting_list) = index.index.get(term) {
for posting in &posting_list.postings {
let doc_meta = match index.docs.get(&posting.object_id) {
Some(meta) => meta,
None => continue,
};
let score = bm25_score(
posting.term_freq,
doc_meta.length,
index.avg_doc_len,
index.doc_count,
posting_list.doc_freq,
¶ms,
);
let entry = doc_scores
.entry(posting.object_id)
.or_insert_with(|| DocumentScore::new(posting.object_id));
entry.total_score += score;
entry.matched_terms.push(term.clone());
entry.positions.extend(posting.positions.iter().copied());
}
}
}
let doc_scores: Vec<_> = doc_scores
.into_values()
.filter(|ds| ds.total_score >= options.min_score)
.collect();
let mut sorted_scores = doc_scores;
sorted_scores.sort_by(|a, b| {
b.total_score
.partial_cmp(&a.total_score)
.unwrap_or(core::cmp::Ordering::Equal)
});
let paginated: Vec<_> = sorted_scores
.into_iter()
.skip(options.offset)
.take(options.limit)
.collect();
let results = paginated
.into_iter()
.filter_map(|ds| {
let doc_meta = index.docs.get(&ds.object_id)?;
let snippet = if options.highlight {
generate_highlighted_snippet(
&ds.matched_terms,
&ds.positions,
options.snippet_length,
)
} else {
String::new()
};
Some(SearchHit {
object_id: ds.object_id,
path: doc_meta.path.clone(),
score: ds.total_score,
snippet,
matched_terms: ds.matched_terms,
positions: ds.positions,
})
})
.collect();
Ok(results)
}
}
struct DocumentScore {
object_id: u64,
total_score: f32,
matched_terms: Vec<String>,
positions: Vec<u32>,
}
impl DocumentScore {
fn new(object_id: u64) -> Self {
Self {
object_id,
total_score: 0.0,
matched_terms: Vec::new(),
positions: Vec::new(),
}
}
}
fn expand_fuzzy_terms(
query_terms: &[String],
index: &InvertedIndex,
max_distance: usize,
) -> Vec<String> {
let mut expanded = Vec::new();
for term in query_terms {
expanded.push(term.clone());
let matches = fuzzy_match(term, index.index.keys().map(|s| s.as_str()), max_distance);
for matched in matches {
if &matched != term {
expanded.push(matched);
}
}
}
expanded
}
fn generate_highlighted_snippet(
matched_terms: &[String],
positions: &[u32],
_max_length: usize,
) -> String {
if matched_terms.is_empty() {
return String::new();
}
let terms = matched_terms.join(", ");
let pos_str = if positions.is_empty() {
String::new()
} else {
let first_pos = positions.iter().min().unwrap_or(&0);
alloc::format!(" at position {}", first_pos)
};
alloc::format!("Matched: {}{}", terms, pos_str)
}
pub fn phrase_search(index: &InvertedIndex, phrase_terms: &[String]) -> Vec<u64> {
if phrase_terms.is_empty() {
return Vec::new();
}
if phrase_terms.len() == 1 {
return match index.index.get(&phrase_terms[0]) {
Some(pl) => pl.postings.iter().map(|p| p.object_id).collect(),
None => Vec::new(),
};
}
let first_term = &phrase_terms[0];
let first_postings = match index.index.get(first_term) {
Some(pl) => &pl.postings,
None => return Vec::new(),
};
let mut candidates: Vec<u64> = first_postings.iter().map(|p| p.object_id).collect();
for term in phrase_terms.iter().skip(1) {
let postings = match index.index.get(term) {
Some(pl) => &pl.postings,
None => return Vec::new(),
};
let term_docs: Vec<u64> = postings.iter().map(|p| p.object_id).collect();
candidates.retain(|doc_id| term_docs.contains(doc_id));
if candidates.is_empty() {
return Vec::new();
}
}
let mut phrase_matches = Vec::new();
for doc_id in candidates {
if check_phrase_positions(index, doc_id, phrase_terms) {
phrase_matches.push(doc_id);
}
}
phrase_matches
}
fn check_phrase_positions(index: &InvertedIndex, doc_id: u64, phrase_terms: &[String]) -> bool {
let first_positions = match index.index.get(&phrase_terms[0]) {
Some(pl) => match pl.postings.iter().find(|p| p.object_id == doc_id) {
Some(posting) => &posting.positions,
None => return false,
},
None => return false,
};
'outer: for &start_pos in first_positions {
let mut expected_pos = start_pos;
for (offset, term) in phrase_terms.iter().enumerate() {
expected_pos = start_pos + offset as u32;
let positions = match index.index.get(term) {
Some(pl) => match pl.postings.iter().find(|p| p.object_id == doc_id) {
Some(posting) => &posting.positions,
None => continue 'outer,
},
None => continue 'outer,
};
if !positions.contains(&expected_pos) {
continue 'outer;
}
}
return true;
}
false
}
#[cfg(test)]
mod tests {
use super::*;
use crate::fts::index::FtsEngine;
#[test]
fn test_basic_search() {
let mut engine = FtsEngine::new();
engine
.index_document("test", 1, "/doc1.txt", b"hello world foo bar")
.unwrap();
engine
.index_document("test", 2, "/doc2.txt", b"hello there friend")
.unwrap();
engine
.index_document("test", 3, "/doc3.txt", b"goodbye world")
.unwrap();
let options = SearchOptions::default();
let results = engine.search("test", "hello", &options).unwrap();
assert_eq!(results.len(), 2);
}
#[test]
fn test_multi_term_search() {
let mut engine = FtsEngine::new();
engine
.index_document("test", 1, "/doc1.txt", b"hello world")
.unwrap();
engine
.index_document("test", 2, "/doc2.txt", b"hello there")
.unwrap();
let options = SearchOptions::default();
let results = engine.search("test", "hello world", &options).unwrap();
assert!(!results.is_empty());
assert_eq!(results[0].object_id, 1);
}
#[test]
fn test_empty_query() {
let mut engine = FtsEngine::new();
engine
.index_document("test", 1, "/doc1.txt", b"hello world")
.unwrap();
let options = SearchOptions::default();
let results = engine.search("test", "", &options).unwrap();
assert!(results.is_empty());
}
}