use std::cmp::{max, min};
use std::sync::Arc;
use regex::Regex;
use super::scoring_utils::{
edit_distance, fold_with_byte_map, generate_trigrams, intersect_sorted_vecs, ngram_threshold,
token_match_distance, tokenize_raw, HighlightSink,
};
use crate::fieldnorm::FieldNormReader;
use crate::index::SegmentId;
use crate::query::bm25::Bm25Weight;
use crate::query::{EmptyScorer, EnableScoring, Explanation, Query, Scorer, Weight};
use crate::schema::document::Value;
use crate::schema::{Field, IndexRecordOption, Term};
use crate::{DocId, DocSet, InvertedIndexReader, LucivyDocument, Score, SegmentReader, TERMINATED};
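/// Reads the full posting list for `term`, returning all matching doc ids in
/// ascending order. Returns an empty vec when the term is absent.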
fn collect_posting_docs(
inverted_index: &InvertedIndexReader,
term: &Term,
) -> crate::Result<Vec<DocId>> {
let term_info = match inverted_index.get_term_info(term)? {
Some(ti) => ti,
None => return Ok(Vec::new()),
};
let mut docs = Vec::new();
let mut block_postings =
inverted_index.read_block_postings_from_terminfo(&term_info, IndexRecordOption::Basic)?;
loop {
let block = block_postings.docs();
if block.is_empty() {
break;
}
docs.extend_from_slice(block);
block_postings.advance();
}
Ok(docs)
}
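/// Finds candidate docs for one query token: each of the token's trigrams is
/// looked up in the n-gram field, and a doc survives only if it occurs in at
/// least `ngram_threshold(trigram_count, fuzzy_distance)` of those posting lists.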
fn ngram_candidates_for_token(
token: &str,
ngram_field: Field,
ngram_inverted: &InvertedIndexReader,
fuzzy_distance: u8,
) -> crate::Result<Vec<DocId>> {
    let trigrams = generate_trigrams(token);
    if trigrams.is_empty() {
        return Ok(Vec::new());
    }
    // Minimum number of trigram hits a doc needs to remain a candidate.
    let threshold = ngram_threshold(trigrams.len(), fuzzy_distance);
let mut all_docs: Vec<DocId> = Vec::new();
for trigram in &trigrams {
let term = Term::from_field_text(ngram_field, trigram);
let docs = collect_posting_docs(ngram_inverted, &term)?;
all_docs.extend(docs);
}
all_docs.sort_unstable();
let mut candidates = Vec::new();
let mut i = 0;
while i < all_docs.len() {
let doc = all_docs[i];
let mut count = 0usize;
while i < all_docs.len() && all_docs[i] == doc {
count += 1;
i += 1;
}
if count >= threshold {
candidates.push(doc);
}
}
Ok(candidates)
}
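/// Parameters for fuzzy verification: the query tokens, the separators
/// expected between consecutive tokens, optional prefix/suffix fragments, the
/// per-token edit distance (`fuzzy_distance`), and the total edit distance
/// allowed across the whole match (`distance_budget`).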
#[derive(Clone, Debug)]
#[allow(missing_docs)]
pub struct FuzzyParams {
pub tokens: Vec<String>,
pub separators: Vec<String>,
pub prefix: String,
pub suffix: String,
pub fuzzy_distance: u8,
pub distance_budget: u32,
pub strict_separators: bool,
}
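/// Parameters for regex verification: the pattern compiled against ASCII-folded
/// text, the literal strings extracted from it, and an optional fuzzy distance
/// under which those literals are also matched fuzzily as a fallback.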
#[derive(Clone, Debug)]
pub struct RegexParams {
pub compiled: Regex,
pub literals: Vec<String>,
pub fuzzy_distance: u8,
}
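/// How candidate documents are verified against their stored text.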
#[derive(Clone, Debug)]
pub enum VerificationMode {
Fuzzy(FuzzyParams),
Regex(RegexParams),
}
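/// A substring-style query: candidates are pre-filtered through a trigram
/// index (`ngram_field`), then each candidate's stored text is verified with
/// either fuzzy token matching or a regex. Match offsets can optionally be
/// recorded into a `HighlightSink`.
///
/// A minimal usage sketch, assuming `title_raw`/`title_ngram` are a raw field
/// and its trigram-indexed counterpart (mirrors the tests below):
///
/// ```ignore
/// let query = NgramContainsQuery::new(
///     title_raw,
///     title_ngram,
///     None,
///     vec!["auth".into()],
///     VerificationMode::Regex(RegexParams {
///         compiled: Regex::new("(?i)auth").unwrap(),
///         literals: vec!["auth".into()],
///         fuzzy_distance: 0,
///     }),
/// );
/// ```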
#[derive(Clone, Debug)]
pub struct NgramContainsQuery {
raw_field: Field,
ngram_field: Field,
stored_field: Option<Field>,
trigram_sources: Vec<String>,
verification: VerificationMode,
highlight_sink: Option<Arc<HighlightSink>>,
highlight_field_name: String,
}
impl NgramContainsQuery {
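    /// Creates a query over `raw_field` (exact tokens) and `ngram_field`
    /// (trigrams). Verification reads stored text from `stored_field`, falling
    /// back to `raw_field` when it is `None`.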
pub fn new(
raw_field: Field,
ngram_field: Field,
stored_field: Option<Field>,
trigram_sources: Vec<String>,
verification: VerificationMode,
) -> Self {
NgramContainsQuery {
raw_field,
ngram_field,
stored_field,
trigram_sources,
verification,
highlight_sink: None,
highlight_field_name: String::new(),
}
}
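    /// Attaches a sink that collects match byte offsets, keyed under
    /// `field_name`.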
pub fn with_highlight_sink(mut self, sink: Arc<HighlightSink>, field_name: String) -> Self {
self.highlight_sink = Some(sink);
self.highlight_field_name = field_name;
self
}
}
impl Query for NgramContainsQuery {
fn weight(&self, enable_scoring: EnableScoring) -> crate::Result<Box<dyn Weight>> {
let bm25_weight = match enable_scoring {
EnableScoring::Enabled {
statistics_provider,
..
} => {
let terms: Vec<Term> = self
.trigram_sources
.iter()
.map(|t| Term::from_field_text(self.raw_field, t))
.collect();
if terms.is_empty() {
Bm25Weight::for_one_term(0, 1, 1.0)
} else {
Bm25Weight::for_terms(statistics_provider, &terms)?
}
}
EnableScoring::Disabled { .. } => Bm25Weight::for_one_term(0, 1, 1.0),
};
Ok(Box::new(NgramContainsWeight {
raw_field: self.raw_field,
ngram_field: self.ngram_field,
stored_field: self.stored_field,
trigram_sources: self.trigram_sources.clone(),
verification: self.verification.clone(),
highlight_sink: self.highlight_sink.clone(),
highlight_field_name: self.highlight_field_name.clone(),
bm25_weight,
}))
}
}
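/// Weight counterpart of [`NgramContainsQuery`]; computes the candidate set
/// and builds one scorer per segment.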
struct NgramContainsWeight {
raw_field: Field,
ngram_field: Field,
stored_field: Option<Field>,
trigram_sources: Vec<String>,
verification: VerificationMode,
highlight_sink: Option<Arc<HighlightSink>>,
highlight_field_name: String,
bm25_weight: Bm25Weight,
}
impl Weight for NgramContainsWeight {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
let segment_id = reader.segment_id();
let raw_inverted = reader.inverted_index(self.raw_field)?;
let ngram_inverted = reader.inverted_index(self.ngram_field)?;
let final_candidates = match &self.verification {
VerificationMode::Fuzzy(params) => {
let mut per_token_candidates: Vec<Vec<DocId>> = Vec::new();
for source in &self.trigram_sources {
let term = Term::from_field_text(self.raw_field, source);
let exact_docs = collect_posting_docs(&raw_inverted, &term)?;
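                    // Prefer exact raw-field postings for this token; fall back
                    // to trigram candidates only when there are none.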
if !exact_docs.is_empty() {
per_token_candidates.push(exact_docs);
continue;
}
let candidates = ngram_candidates_for_token(
source,
self.ngram_field,
&ngram_inverted,
params.fuzzy_distance,
)?;
per_token_candidates.push(candidates);
}
intersect_sorted_vecs(per_token_candidates)
}
VerificationMode::Regex(params) => {
if self.trigram_sources.is_empty() {
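                    // No trigrams could be extracted from the pattern, so every
                    // document in the segment is a candidate for verification.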
(0..reader.max_doc()).collect()
} else {
let mut all_candidates: Vec<DocId> = Vec::new();
for source in &self.trigram_sources {
let candidates = ngram_candidates_for_token(
source,
self.ngram_field,
&ngram_inverted,
params.fuzzy_distance,
)?;
all_candidates.extend(candidates);
}
all_candidates.sort_unstable();
all_candidates.dedup();
all_candidates
}
}
};
if final_candidates.is_empty() {
return Ok(Box::new(EmptyScorer));
}
let store_reader = reader
.get_store_reader(50)
.map_err(crate::LucivyError::from)?;
let text_field = self.stored_field.unwrap_or(self.raw_field);
        let fieldnorm_reader = reader
            .fieldnorms_readers()
            .get_field(self.raw_field)?
            .unwrap_or_else(|| FieldNormReader::constant(reader.max_doc(), 1));
Ok(Box::new(NgramContainsScorer::new(
final_candidates,
store_reader,
text_field,
self.verification.clone(),
self.bm25_weight.boost_by(boost),
fieldnorm_reader,
self.highlight_sink.clone(),
self.highlight_field_name.clone(),
segment_id,
)))
}
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
let mut scorer = self.scorer(reader, 1.0)?;
if scorer.seek(doc) != doc {
return Err(crate::LucivyError::InvalidArgument(format!(
"Document {doc} does not match"
)));
}
Ok(Explanation::new("NgramContainsScorer", scorer.score()))
}
}
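/// Counts fuzzy occurrences of a single-token query in the stored text,
/// applying the prefix/suffix constraints from `params`, and records highlight
/// offsets for every accepted match.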
fn count_single_token_fuzzy(
stored_text: &str,
doc_tokens: &[(usize, usize)],
params: &FuzzyParams,
highlight_sink: &Option<Arc<HighlightSink>>,
highlight_field_name: &str,
segment_id: SegmentId,
doc_id: DocId,
) -> u32 {
    let query_token = &params.tokens[0];
let mut count = 0u32;
for &(start, end) in doc_tokens {
let doc_token = stored_text[start..end].to_lowercase();
        let distance = match token_match_distance(&doc_token, query_token, params.fuzzy_distance) {
            Some(d) => d,
            None => continue,
        };
let mut total_distance = distance;
if !params.prefix.is_empty() {
if params.strict_separators {
                let prefix_len = params.prefix.len();
                // Clamp to a UTF-8 char boundary so the slice below cannot
                // panic on multi-byte text.
                let mut doc_prefix_start = start.saturating_sub(prefix_len);
                while !stored_text.is_char_boundary(doc_prefix_start) {
                    doc_prefix_start += 1;
                }
                let doc_prefix = &stored_text[doc_prefix_start..start];
                total_distance += edit_distance(&params.prefix, doc_prefix);
if total_distance > params.distance_budget {
continue;
}
} else {
if start == 0 {
continue;
}
if stored_text.as_bytes()[start - 1].is_ascii_alphanumeric() {
continue;
}
}
}
if !params.suffix.is_empty() {
if params.strict_separators {
                let suffix_len = params.suffix.len();
                // Clamp back to a UTF-8 char boundary before slicing.
                let mut doc_suffix_end = min(end + suffix_len, stored_text.len());
                while !stored_text.is_char_boundary(doc_suffix_end) {
                    doc_suffix_end -= 1;
                }
                let doc_suffix = &stored_text[end..doc_suffix_end];
                total_distance += edit_distance(&params.suffix, doc_suffix);
if total_distance > params.distance_budget {
continue;
}
} else {
if end >= stored_text.len() {
continue;
}
if stored_text.as_bytes()[end].is_ascii_alphanumeric() {
continue;
}
}
}
count += 1;
if let Some(sink) = highlight_sink {
sink.insert(segment_id, doc_id, highlight_field_name, vec![[start, end]]);
}
}
count
}
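/// Counts fuzzy occurrences of a multi-token query by sliding a window of
/// `params.tokens.len()` document tokens across the text and testing each
/// starting position.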
fn count_multi_token_fuzzy(
stored_text: &str,
doc_tokens: &[(usize, usize)],
params: &FuzzyParams,
highlight_sink: &Option<Arc<HighlightSink>>,
highlight_field_name: &str,
segment_id: SegmentId,
doc_id: DocId,
) -> u32 {
let num_query = params.tokens.len();
if doc_tokens.len() < num_query {
return 0;
}
let mut count = 0u32;
for start_idx in 0..=(doc_tokens.len() - num_query) {
if check_at_position_fuzzy(
stored_text,
doc_tokens,
start_idx,
params,
highlight_sink,
highlight_field_name,
segment_id,
doc_id,
) {
count += 1;
}
}
count
}
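/// Tests whether the query matches the document tokens starting at
/// `start_idx`, accumulating edit distance over tokens, separators, prefix,
/// and suffix against the shared `distance_budget`. Records highlight offsets
/// on success.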
fn check_at_position_fuzzy(
stored_text: &str,
doc_tokens: &[(usize, usize)],
start_idx: usize,
params: &FuzzyParams,
highlight_sink: &Option<Arc<HighlightSink>>,
highlight_field_name: &str,
segment_id: SegmentId,
doc_id: DocId,
) -> bool {
let mut total_distance = 0u32;
for (q_idx, query_token) in params.tokens.iter().enumerate() {
let (start, end) = doc_tokens[start_idx + q_idx];
let doc_token = stored_text[start..end].to_lowercase();
match token_match_distance(&doc_token, query_token, params.fuzzy_distance) {
Some(d) => total_distance += d,
None => return false,
}
if total_distance > params.distance_budget {
return false;
}
}
for (sep_idx, query_sep) in params.separators.iter().enumerate() {
let (_, end_i) = doc_tokens[start_idx + sep_idx];
let (start_next, _) = doc_tokens[start_idx + sep_idx + 1];
if end_i > stored_text.len() || start_next > stored_text.len() || end_i > start_next {
return false;
}
let doc_sep = &stored_text[end_i..start_next];
if params.strict_separators {
total_distance += edit_distance(query_sep, doc_sep);
if total_distance > params.distance_budget {
return false;
}
        } else if doc_sep.is_empty() || doc_sep.bytes().all(|b| b.is_ascii_alphanumeric()) {
            // Non-strict mode still requires some non-alphanumeric separator.
return false;
}
}
if !params.prefix.is_empty() {
let (first_start, _) = doc_tokens[start_idx];
if params.strict_separators {
            let prefix_len = params.prefix.len();
            // Clamp to a UTF-8 char boundary so the slice cannot panic on
            // multi-byte text.
            let mut doc_prefix_start = first_start.saturating_sub(prefix_len);
            while !stored_text.is_char_boundary(doc_prefix_start) {
                doc_prefix_start += 1;
            }
            let doc_prefix = &stored_text[doc_prefix_start..first_start];
            total_distance += edit_distance(&params.prefix, doc_prefix);
if total_distance > params.distance_budget {
return false;
}
} else {
if first_start == 0 {
return false;
}
let before = &stored_text[..first_start];
if before
.as_bytes()
.last()
.is_none_or(|b| b.is_ascii_alphanumeric())
{
return false;
}
}
}
if !params.suffix.is_empty() {
let num_query = params.tokens.len();
let (_, last_end) = doc_tokens[start_idx + num_query - 1];
if params.strict_separators {
            let suffix_len = params.suffix.len();
            // Clamp back to a UTF-8 char boundary before slicing.
            let mut doc_suffix_end = min(last_end + suffix_len, stored_text.len());
            while !stored_text.is_char_boundary(doc_suffix_end) {
                doc_suffix_end -= 1;
            }
            let doc_suffix = &stored_text[last_end..doc_suffix_end];
            total_distance += edit_distance(&params.suffix, doc_suffix);
if total_distance > params.distance_budget {
return false;
}
} else {
if last_end >= stored_text.len() {
return false;
}
if stored_text.as_bytes()[last_end].is_ascii_alphanumeric() {
return false;
}
}
}
if let Some(sink) = highlight_sink {
let offsets: Vec<[usize; 2]> = (0..params.tokens.len())
.map(|i| {
let (s, e) = doc_tokens[start_idx + i];
[s, e]
})
.collect();
sink.insert(segment_id, doc_id, highlight_field_name, offsets);
}
true
}
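/// Verifies stored text in regex mode: counts regex matches on the ASCII-folded
/// text and, when `fuzzy_distance > 0`, fuzzy matches of the pattern's literal
/// strings; the larger count becomes the term frequency. Match offsets are
/// mapped back to byte positions in the original text via `byte_map`.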
fn verify_regex(
stored_text: &str,
params: &RegexParams,
highlight_sink: &Option<Arc<HighlightSink>>,
highlight_field_name: &str,
segment_id: SegmentId,
doc_id: DocId,
) -> u32 {
let (folded_text, byte_map) = fold_with_byte_map(stored_text);
let regex_matches: Vec<regex::Match> = params.compiled.find_iter(&folded_text).collect();
let tf_regex = regex_matches.len() as u32;
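    // Fuzzy fallback: when allowed, count occurrences of the best-matching
    // pattern literal so a typo'd literal can still produce a match.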
let tf_fuzzy = if params.fuzzy_distance > 0 && !params.literals.is_empty() {
let doc_tokens = tokenize_raw(stored_text);
params
.literals
.iter()
.map(|lit| {
let lit_lower = lit.to_lowercase();
let mut count = 0u32;
for &(start, end) in &doc_tokens {
let doc_token = stored_text[start..end].to_lowercase();
if token_match_distance(&doc_token, &lit_lower, params.fuzzy_distance).is_some()
{
count += 1;
}
}
count
})
.max()
.unwrap_or(0)
} else {
0
};
let tf = max(tf_regex, tf_fuzzy);
    // Only regex matches carry offsets; fuzzy-only literal matches do not
    // produce highlights here.
    if tf_regex > 0 {
        if let Some(sink) = highlight_sink {
            let offsets: Vec<[usize; 2]> = regex_matches
                .iter()
                .map(|m| [byte_map[m.start()], byte_map[m.end()]])
                .collect();
            sink.insert(segment_id, doc_id, highlight_field_name, offsets);
        }
    }
tf
}
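/// Scorer over a pre-computed, sorted candidate list. Each candidate's stored
/// text is fetched and verified lazily; `last_tf` holds the verified term
/// frequency of the current doc, used for BM25 scoring.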
struct NgramContainsScorer {
candidates: Vec<DocId>,
cursor: usize,
store_reader: crate::store::StoreReader,
text_field: Field,
verification: VerificationMode,
bm25_weight: Bm25Weight,
fieldnorm_reader: FieldNormReader,
last_tf: u32,
highlight_sink: Option<Arc<HighlightSink>>,
highlight_field_name: String,
segment_id: SegmentId,
}
impl NgramContainsScorer {
fn new(
candidates: Vec<DocId>,
store_reader: crate::store::StoreReader,
text_field: Field,
verification: VerificationMode,
bm25_weight: Bm25Weight,
fieldnorm_reader: FieldNormReader,
highlight_sink: Option<Arc<HighlightSink>>,
highlight_field_name: String,
segment_id: SegmentId,
) -> Self {
let mut scorer = NgramContainsScorer {
candidates,
cursor: 0,
store_reader,
text_field,
verification,
bm25_weight,
fieldnorm_reader,
last_tf: 0,
highlight_sink,
highlight_field_name,
segment_id,
};
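        // Advance to the first candidate that passes verification so `doc()`
        // points at a valid match immediately after construction.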
if scorer.doc() != TERMINATED && !scorer.verify() {
scorer.advance();
}
scorer
}
fn verify(&mut self) -> bool {
self.last_tf = 0;
let doc_id = self.doc();
if doc_id == TERMINATED {
return false;
}
let doc: LucivyDocument = match self.store_reader.get(doc_id) {
Ok(d) => d,
Err(_) => return false,
};
        let stored_text = match doc.get_first(self.text_field).and_then(|v| v.as_str()) {
            Some(s) => s,
            None => return false,
        };
let tf = match &self.verification {
VerificationMode::Fuzzy(params) => {
let doc_tokens = tokenize_raw(stored_text);
if params.tokens.len() == 1 {
count_single_token_fuzzy(
stored_text,
&doc_tokens,
params,
&self.highlight_sink,
&self.highlight_field_name,
self.segment_id,
doc_id,
)
} else {
count_multi_token_fuzzy(
stored_text,
&doc_tokens,
params,
&self.highlight_sink,
&self.highlight_field_name,
self.segment_id,
doc_id,
)
}
}
VerificationMode::Regex(params) => {
verify_regex(
stored_text,
params,
&self.highlight_sink,
&self.highlight_field_name,
self.segment_id,
doc_id,
)
}
};
self.last_tf = tf;
tf > 0
}
}
impl DocSet for NgramContainsScorer {
fn advance(&mut self) -> DocId {
loop {
self.cursor += 1;
let doc = self.doc();
if doc == TERMINATED || self.verify() {
return doc;
}
}
}
fn seek(&mut self, target: DocId) -> DocId {
while self.cursor < self.candidates.len() && self.candidates[self.cursor] < target {
self.cursor += 1;
}
if self.doc() == TERMINATED || self.verify() {
return self.doc();
}
        self.advance()
}
fn doc(&self) -> DocId {
if self.cursor < self.candidates.len() {
self.candidates[self.cursor]
} else {
TERMINATED
}
}
fn size_hint(&self) -> u32 {
self.candidates.len().saturating_sub(self.cursor) as u32
}
}
impl Scorer for NgramContainsScorer {
fn score(&mut self) -> Score {
let doc = self.doc();
let fieldnorm_id = self.fieldnorm_reader.fieldnorm_id(doc);
self.bm25_weight.score(fieldnorm_id, self.last_tf)
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::index::SegmentId;
fn test_seg_id() -> SegmentId {
SegmentId::generate_random()
}
fn make_regex_params(pattern: &str, literals: Vec<&str>, fuzzy_distance: u8) -> RegexParams {
let mut folded = String::new();
crate::tokenizer::to_ascii(pattern, &mut folded);
RegexParams {
compiled: Regex::new(&format!("(?i){folded}")).unwrap(),
literals: literals.into_iter().map(|s| s.to_string()).collect(),
fuzzy_distance,
}
}
#[test]
fn test_regex_pure_match() {
let params = make_regex_params(r"program[a-z]+", vec!["program"], 0);
let tf = verify_regex("Rust is a systems programming language", ¶ms, &None, "", test_seg_id(), 0);
assert_eq!(tf, 1); }
#[test]
fn test_regex_pure_no_match() {
let params = make_regex_params(r"program[a-z]+", vec!["program"], 0);
let tf = verify_regex("the cat sat on the mat", ¶ms, &None, "", test_seg_id(), 0);
assert_eq!(tf, 0);
}
#[test]
fn test_regex_pure_multiple_matches() {
let params = make_regex_params(r"program[a-z]+", vec!["program"], 0);
let tf = verify_regex(
"Programming in Rust: a programmer's guide to programming",
            &params,
&None,
"",
test_seg_id(),
0,
);
        assert_eq!(tf, 3);
    }
#[test]
fn test_regex_case_insensitive() {
let params = make_regex_params(r"rust", vec!["rust"], 0);
let tf = verify_regex("Rust is great", ¶ms, &None, "", test_seg_id(), 0);
assert_eq!(tf, 1);
}
#[test]
fn test_regex_hybrid_typo_in_pattern() {
let params = make_regex_params(r"programing[a-z]+", vec!["programing"], 1);
let tf = verify_regex("Rust is a systems programming language", ¶ms, &None, "", test_seg_id(), 0);
assert!(tf > 0, "hybrid should match via fuzzy on literal");
}
#[test]
fn test_regex_hybrid_exact_wins() {
let params = make_regex_params(r"program[a-z]+", vec!["program"], 1);
let tf = verify_regex("Rust programming is fun", ¶ms, &None, "", test_seg_id(), 0);
assert!(tf > 0);
}
#[test]
fn test_regex_hybrid_no_match() {
let params = make_regex_params(r"python[a-z]+", vec!["python"], 1);
let tf = verify_regex("Rust is a systems programming language", ¶ms, &None, "", test_seg_id(), 0);
assert_eq!(tf, 0);
}
#[test]
fn test_regex_highlights() {
let sink = Arc::new(HighlightSink::new());
let sid = test_seg_id();
let params = make_regex_params(r"program[a-z]+", vec!["program"], 0);
let text = "Rust programming is fun";
        let tf = verify_regex(text, &params, &Some(sink.clone()), "", sid, 42);
assert_eq!(tf, 1);
let by_field = sink.get(sid, 42).expect("should have highlights");
let offsets = by_field.get("").expect("should have field offsets");
assert_eq!(offsets.len(), 1);
assert_eq!(offsets[0], [5, 16]);
}
#[test]
fn test_regex_empty_text() {
let params = make_regex_params(r"program[a-z]+", vec!["program"], 0);
let tf = verify_regex("", ¶ms, &None, "", test_seg_id(), 0);
assert_eq!(tf, 0);
}
#[test]
fn test_regex_dot_star() {
let params = make_regex_params(r".*", vec![], 0);
let tf = verify_regex("anything", ¶ms, &None, "", test_seg_id(), 0);
assert!(tf > 0); }
#[test]
fn test_regex_word_boundary() {
let params = make_regex_params(r"\brust\b", vec!["rust"], 0);
let tf = verify_regex("Rust is great but rusty is not", ¶ms, &None, "", test_seg_id(), 0);
assert_eq!(tf, 1); }
#[test]
fn test_regex_unicode() {
let params = make_regex_params(r"café", vec!["café"], 0);
let tf = verify_regex("I love café au lait", ¶ms, &None, "", test_seg_id(), 0);
assert_eq!(tf, 1);
}
#[test]
fn test_regex_hybrid_fuzzy_only_match() {
let params = make_regex_params(r"databse", vec!["databse"], 1);
let tf = verify_regex("Graph databases store data", ¶ms, &None, "", test_seg_id(), 0);
assert!(tf > 0, "hybrid should match via fuzzy on literal");
}
#[test]
fn test_regex_multiple_highlights() {
let sink = Arc::new(HighlightSink::new());
let sid = test_seg_id();
let params = make_regex_params(r"[a-z]+ing", vec!["ing"], 0);
let text = "programming and testing are fun";
        let tf = verify_regex(text, &params, &Some(sink.clone()), "", sid, 99);
        assert_eq!(tf, 2);
        let by_field = sink.get(sid, 99).expect("should have highlights");
let offsets = by_field.get("").expect("should have field offsets");
assert_eq!(offsets.len(), 2);
}
fn make_fuzzy_params(
tokens: Vec<&str>,
separators: Vec<&str>,
prefix: &str,
suffix: &str,
distance: u8,
budget: u32,
) -> FuzzyParams {
FuzzyParams {
tokens: tokens.into_iter().map(|s| s.to_string()).collect(),
separators: separators.into_iter().map(|s| s.to_string()).collect(),
prefix: prefix.to_string(),
suffix: suffix.to_string(),
fuzzy_distance: distance,
distance_budget: budget,
strict_separators: true,
}
}
#[test]
fn test_fuzzy_single_exact() {
let text = "Rust is a programming language";
let tokens = tokenize_raw(text);
let params = make_fuzzy_params(vec!["programming"], vec![], "", "", 1, 1);
assert_eq!(
            count_single_token_fuzzy(text, &tokens, &params, &None, "", test_seg_id(), 0),
1
);
}
#[test]
fn test_fuzzy_single_typo() {
let text = "Rust is a programming language";
let tokens = tokenize_raw(text);
let params = make_fuzzy_params(vec!["programing"], vec![], "", "", 1, 1);
assert_eq!(
            count_single_token_fuzzy(text, &tokens, &params, &None, "", test_seg_id(), 0),
1
);
}
#[test]
fn test_fuzzy_single_no_match() {
let text = "Rust is a programming language";
let tokens = tokenize_raw(text);
let params = make_fuzzy_params(vec!["python"], vec![], "", "", 1, 1);
assert_eq!(
            count_single_token_fuzzy(text, &tokens, &params, &None, "", test_seg_id(), 0),
0
);
}
#[test]
fn test_fuzzy_single_multiple_matches() {
let text = "programming in Rust: a programmer guide";
let tokens = tokenize_raw(text);
let params = make_fuzzy_params(vec!["program"], vec![], "", "", 1, 1);
assert_eq!(
            count_single_token_fuzzy(text, &tokens, &params, &None, "", test_seg_id(), 0),
2
);
}
#[test]
fn test_fuzzy_single_distance_zero_no_match() {
let text = "Rust is a programming language";
let tokens = tokenize_raw(text);
let params = make_fuzzy_params(vec!["programing"], vec![], "", "", 0, 0);
assert_eq!(
            count_single_token_fuzzy(text, &tokens, &params, &None, "", test_seg_id(), 0),
0
);
}
#[test]
fn test_fuzzy_single_highlights() {
let sink = Arc::new(HighlightSink::new());
let sid = test_seg_id();
let text = "Rust programming is fun";
let tokens = tokenize_raw(text);
let params = make_fuzzy_params(vec!["programming"], vec![], "", "", 1, 1);
let tf =
            count_single_token_fuzzy(text, &tokens, &params, &Some(sink.clone()), "", sid, 42);
assert_eq!(tf, 1);
let by_field = sink.get(sid, 42).expect("should have highlights");
let offsets = by_field.get("").expect("should have field offsets");
assert_eq!(offsets.len(), 1);
        assert_eq!(offsets[0], [5, 16]);
    }
#[test]
fn test_fuzzy_multi_exact() {
let text = "Rust is a systems programming language";
let tokens = tokenize_raw(text);
let params =
make_fuzzy_params(vec!["systems", "programming"], vec![" "], "", "", 1, 1);
assert_eq!(
            count_multi_token_fuzzy(text, &tokens, &params, &None, "", test_seg_id(), 0),
1
);
}
#[test]
fn test_fuzzy_multi_typo() {
let text = "Rust is a systems programming language";
let tokens = tokenize_raw(text);
let params =
make_fuzzy_params(vec!["sistems", "programing"], vec![" "], "", "", 1, 2);
assert_eq!(
            count_multi_token_fuzzy(text, &tokens, &params, &None, "", test_seg_id(), 0),
1
);
}
#[test]
fn test_fuzzy_multi_budget_exceeded() {
let text = "Rust is a systems programming language";
let tokens = tokenize_raw(text);
let params =
make_fuzzy_params(vec!["sistems", "programing"], vec![" "], "", "", 1, 1);
assert_eq!(
            count_multi_token_fuzzy(text, &tokens, &params, &None, "", test_seg_id(), 0),
0
);
}
#[test]
fn test_fuzzy_multi_no_match() {
let text = "Rust is a systems programming language";
let tokens = tokenize_raw(text);
let params =
make_fuzzy_params(vec!["machine", "learning"], vec![" "], "", "", 1, 1);
assert_eq!(
            count_multi_token_fuzzy(text, &tokens, &params, &None, "", test_seg_id(), 0),
0
);
}
#[test]
fn test_fuzzy_multi_not_enough_tokens() {
let text = "Rust";
let tokens = tokenize_raw(text);
let params =
make_fuzzy_params(vec!["systems", "programming"], vec![" "], "", "", 1, 1);
assert_eq!(
            count_multi_token_fuzzy(text, &tokens, &params, &None, "", test_seg_id(), 0),
0
);
}
#[test]
fn test_fuzzy_multi_highlights() {
let sink = Arc::new(HighlightSink::new());
let sid = test_seg_id();
let text = "Rust is a systems programming language";
let tokens = tokenize_raw(text);
let params =
make_fuzzy_params(vec!["systems", "programming"], vec![" "], "", "", 1, 1);
let tf =
            count_multi_token_fuzzy(text, &tokens, &params, &Some(sink.clone()), "", sid, 42);
assert_eq!(tf, 1);
let by_field = sink.get(sid, 42).expect("should have highlights");
let offsets = by_field.get("").expect("should have field offsets");
assert_eq!(offsets.len(), 2);
        assert_eq!(offsets[0], [10, 17]);
        assert_eq!(offsets[1], [18, 29]);
    }
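    // End-to-end tests: build a real two-field index and verify that highlights
    // collected through a BooleanQuery are preserved per field.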
use crate::collector::TopDocs;
use crate::query::boolean_query::BooleanQuery;
use crate::query::Occur;
use crate::schema::{Schema, TextFieldIndexing, TextOptions, STORED};
use crate::tokenizer::NgramTokenizer;
use crate::Index;
fn create_two_field_index(docs: &[(&str, &str)]) -> crate::Result<Index> {
let mut sb = Schema::builder();
let raw_opts = TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default()
.set_tokenizer("default")
.set_index_option(crate::schema::IndexRecordOption::WithFreqsAndPositionsAndOffsets),
)
.set_stored();
let ngram_opts = TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default()
.set_tokenizer("ngram3")
.set_index_option(crate::schema::IndexRecordOption::WithFreqs),
);
let title_raw = sb.add_text_field("_title", raw_opts.clone());
let title_ngram = sb.add_text_field("_title._ngram", ngram_opts.clone());
let content_raw = sb.add_text_field("_content", raw_opts);
let content_ngram = sb.add_text_field("_content._ngram", ngram_opts);
let schema = sb.build();
let index = Index::create_in_ram(schema);
index
.tokenizers()
.register("ngram3", NgramTokenizer::all_ngrams(3, 3).unwrap());
let mut writer = index.writer_for_tests()?;
for &(title, content) in docs {
let mut doc = crate::LucivyDocument::new();
doc.add_text(title_raw, title);
doc.add_text(title_ngram, title);
doc.add_text(content_raw, content);
doc.add_text(content_ngram, content);
writer.add_document(doc)?;
}
writer.commit()?;
Ok(index)
}
#[test]
fn test_boolean_multi_field_highlights_not_lost() -> crate::Result<()> {
let index = create_two_field_index(&[
("login page", "authentication and authorization"),
])?;
let schema = index.schema();
let title_raw = schema.get_field("_title").unwrap();
let title_ngram = schema.get_field("_title._ngram").unwrap();
let content_raw = schema.get_field("_content").unwrap();
let content_ngram = schema.get_field("_content._ngram").unwrap();
let sink = Arc::new(HighlightSink::new());
let q_title = NgramContainsQuery::new(
title_raw,
title_ngram,
None,
vec!["auth".into()],
VerificationMode::Regex(RegexParams {
compiled: Regex::new("(?i)auth").unwrap(),
literals: vec!["auth".into()],
fuzzy_distance: 0,
}),
)
.with_highlight_sink(sink.clone(), "_title".into());
let q_content = NgramContainsQuery::new(
content_raw,
content_ngram,
None,
vec!["auth".into()],
VerificationMode::Regex(RegexParams {
compiled: Regex::new("(?i)auth").unwrap(),
literals: vec!["auth".into()],
fuzzy_distance: 0,
}),
)
.with_highlight_sink(sink.clone(), "_content".into());
let bool_query = BooleanQuery::new(vec![
(Occur::Should, Box::new(q_title)),
(Occur::Should, Box::new(q_content)),
]);
let reader = index.reader()?;
let searcher = reader.searcher();
let top_docs = searcher.search(&bool_query, &TopDocs::with_limit(10).order_by_score())?;
assert_eq!(top_docs.len(), 1, "should find 1 document");
let (_score, doc_addr) = top_docs[0];
let seg_id = searcher.segment_reader(doc_addr.segment_ord).segment_id();
let by_field = sink.get(seg_id, doc_addr.doc_id)
.expect("highlights should exist for matching document");
assert!(
by_field.contains_key("_content"),
"should have _content highlights, got: {:?}",
by_field.keys().collect::<Vec<_>>()
);
let content_offsets = &by_field["_content"];
assert!(!content_offsets.is_empty(), "content highlights should not be empty");
assert_eq!(content_offsets[0][0], 0);
Ok(())
}
#[test]
fn test_boolean_both_fields_highlighted() -> crate::Result<()> {
let index = create_two_field_index(&[
("source code review", "the source of truth"),
])?;
let schema = index.schema();
let title_raw = schema.get_field("_title").unwrap();
let title_ngram = schema.get_field("_title._ngram").unwrap();
let content_raw = schema.get_field("_content").unwrap();
let content_ngram = schema.get_field("_content._ngram").unwrap();
let sink = Arc::new(HighlightSink::new());
let q_title = NgramContainsQuery::new(
title_raw, title_ngram, None,
vec!["source".into()],
VerificationMode::Regex(RegexParams {
compiled: Regex::new("(?i)source").unwrap(),
literals: vec!["source".into()],
fuzzy_distance: 0,
}),
).with_highlight_sink(sink.clone(), "_title".into());
let q_content = NgramContainsQuery::new(
content_raw, content_ngram, None,
vec!["source".into()],
VerificationMode::Regex(RegexParams {
compiled: Regex::new("(?i)source").unwrap(),
literals: vec!["source".into()],
fuzzy_distance: 0,
}),
).with_highlight_sink(sink.clone(), "_content".into());
let bool_query = BooleanQuery::new(vec![
(Occur::Should, Box::new(q_title)),
(Occur::Should, Box::new(q_content)),
]);
let reader = index.reader()?;
let searcher = reader.searcher();
let top_docs = searcher.search(&bool_query, &TopDocs::with_limit(10).order_by_score())?;
assert_eq!(top_docs.len(), 1);
let (_score, doc_addr) = top_docs[0];
let seg_id = searcher.segment_reader(doc_addr.segment_ord).segment_id();
let by_field = sink.get(seg_id, doc_addr.doc_id)
.expect("highlights should exist");
assert!(
by_field.contains_key("_title"),
"should have _title highlights, got: {:?}", by_field.keys().collect::<Vec<_>>()
);
assert!(
by_field.contains_key("_content"),
"should have _content highlights, got: {:?}", by_field.keys().collect::<Vec<_>>()
);
assert_eq!(by_field["_title"][0], [0, 6]);
assert_eq!(by_field["_content"][0], [4, 10]);
Ok(())
}
}