use crate::core::{DocId, FieldId, NO_MORE_DOCS, Result, ScoreMode, Scorer, TwoPhaseIterator};
use crate::inverted::norms::FieldNormsReader;
use crate::inverted::postings::PositionPostingListReader;
use crate::query::{BoundQuery, BoundSpanQuery, Query, ScorerSupplier, SpanQuery};
use crate::search::bm25::{bm25_idf, bm25_score};
use crate::search::searcher::Searcher;
use crate::segment::reader::SegmentReader;
const NO_MORE_POSITIONS: u32 = u32::MAX;
trait Spans: Send {
fn doc_id(&self) -> DocId;
fn next_doc(&mut self) -> DocId;
fn advance_doc(&mut self, target: DocId) -> DocId;
fn next_start_position(&mut self) -> u32;
fn start_position(&self) -> u32;
fn end_position(&self) -> u32;
fn width(&self) -> u32 {
0
}
}
struct TermSpans<'a> {
reader: PositionPostingListReader<'a>,
pos_index: usize,
current_doc: DocId,
current_tf: u32,
}
unsafe impl Send for TermSpans<'_> {}
impl<'a> TermSpans<'a> {
fn new(reader: PositionPostingListReader<'a>) -> Self {
Self {
reader,
pos_index: 0,
current_doc: NO_MORE_DOCS,
current_tf: 0,
}
}
}
impl Spans for TermSpans<'_> {
fn doc_id(&self) -> DocId {
self.current_doc
}
fn next_doc(&mut self) -> DocId {
self.pos_index = 0;
match self.reader.next_doc() {
Some(doc) => {
self.current_doc = doc;
self.current_tf = self.reader.current_tf();
doc
}
None => {
self.current_doc = NO_MORE_DOCS;
self.current_tf = 0;
NO_MORE_DOCS
}
}
}
fn advance_doc(&mut self, target: DocId) -> DocId {
self.pos_index = 0;
match self.reader.advance(target) {
Some(doc) => {
self.current_doc = doc;
self.current_tf = self.reader.positions().len() as u32;
doc
}
None => {
self.current_doc = NO_MORE_DOCS;
self.current_tf = 0;
NO_MORE_DOCS
}
}
}
fn next_start_position(&mut self) -> u32 {
if self.current_doc == NO_MORE_DOCS {
return NO_MORE_POSITIONS;
}
if self.current_tf == 1 {
if self.pos_index == 0 {
self.pos_index = 1;
return self.reader.first_position();
}
return NO_MORE_POSITIONS;
}
let positions = self.reader.positions();
if self.pos_index < positions.len() {
let pos = positions[self.pos_index];
self.pos_index += 1;
pos
} else {
NO_MORE_POSITIONS
}
}
fn start_position(&self) -> u32 {
if self.pos_index == 0 {
return NO_MORE_POSITIONS;
}
if self.current_tf == 1 {
self.reader.first_position()
} else {
self.reader.positions()[self.pos_index - 1]
}
}
fn end_position(&self) -> u32 {
if self.pos_index == 0 {
return NO_MORE_POSITIONS;
}
self.start_position() + 1
}
}
struct FilterSpans<S: Spans> {
inner: S,
max_end: u32,
}
impl<S: Spans> Spans for FilterSpans<S> {
fn doc_id(&self) -> DocId {
self.inner.doc_id()
}
fn next_doc(&mut self) -> DocId {
self.inner.next_doc()
}
fn advance_doc(&mut self, target: DocId) -> DocId {
self.inner.advance_doc(target)
}
fn next_start_position(&mut self) -> u32 {
let pos = self.inner.next_start_position();
if pos == NO_MORE_POSITIONS {
return NO_MORE_POSITIONS;
}
if self.inner.end_position() > self.max_end {
NO_MORE_POSITIONS
} else {
pos
}
}
fn start_position(&self) -> u32 {
self.inner.start_position()
}
fn end_position(&self) -> u32 {
self.inner.end_position()
}
fn width(&self) -> u32 {
self.inner.width()
}
}
struct NearSpansOrdered<'a> {
sub_spans: Vec<TermSpans<'a>>,
slop: u32,
current_doc: DocId,
match_start: u32,
match_end: u32,
match_width: u32,
first_in_doc: bool,
}
unsafe impl Send for NearSpansOrdered<'_> {}
impl<'a> NearSpansOrdered<'a> {
fn new(sub_spans: Vec<TermSpans<'a>>, slop: u32) -> Self {
Self {
sub_spans,
slop,
current_doc: NO_MORE_DOCS,
match_start: NO_MORE_POSITIONS,
match_end: NO_MORE_POSITIONS,
match_width: 0,
first_in_doc: false,
}
}
fn advance_to_common_doc(&mut self) -> DocId {
if self.sub_spans.is_empty() {
return NO_MORE_DOCS;
}
let mut target = self.sub_spans[0].doc_id();
if target == NO_MORE_DOCS {
return NO_MORE_DOCS;
}
let mut i = 1;
while i < self.sub_spans.len() {
let doc = self.sub_spans[i].doc_id();
if doc == target {
i += 1;
continue;
}
if doc == NO_MORE_DOCS {
return NO_MORE_DOCS;
}
if doc < target {
let new_doc = self.sub_spans[i].advance_doc(target);
if new_doc == NO_MORE_DOCS {
return NO_MORE_DOCS;
}
if new_doc > target {
target = new_doc;
let d0 = self.sub_spans[0].advance_doc(target);
if d0 == NO_MORE_DOCS {
return NO_MORE_DOCS;
}
target = d0;
i = 1; continue;
}
i += 1;
} else {
target = doc;
let d0 = self.sub_spans[0].advance_doc(target);
if d0 == NO_MORE_DOCS {
return NO_MORE_DOCS;
}
target = d0;
i = 1;
}
}
target
}
fn stretch_to_order(&mut self) -> bool {
self.match_start = self.sub_spans[0].start_position();
if self.match_start == NO_MORE_POSITIONS {
return false;
}
self.match_width = 0;
for i in 1..self.sub_spans.len() {
let prev_end = self.sub_spans[i - 1].end_position();
while self.sub_spans[i].start_position() < prev_end {
if self.sub_spans[i].next_start_position() == NO_MORE_POSITIONS {
return false;
}
}
let gap = self.sub_spans[i].start_position() - prev_end;
self.match_width += gap;
}
self.match_end = self.sub_spans.last().unwrap().end_position();
self.match_width <= self.slop
}
fn find_next_match_in_doc(&mut self) -> bool {
loop {
if !self.stretch_to_order() {
return false;
}
if self.match_width <= self.slop {
return true;
}
if self.sub_spans[0].next_start_position() == NO_MORE_POSITIONS {
return false;
}
self.match_start = self.sub_spans[0].start_position();
}
}
}
impl Spans for NearSpansOrdered<'_> {
fn doc_id(&self) -> DocId {
self.current_doc
}
fn next_doc(&mut self) -> DocId {
let next = self.sub_spans[0].next_doc();
if next == NO_MORE_DOCS {
self.current_doc = NO_MORE_DOCS;
return NO_MORE_DOCS;
}
for i in 1..self.sub_spans.len() {
self.sub_spans[i].next_doc();
}
self.current_doc = self.advance_to_common_doc();
self.first_in_doc = true;
self.current_doc
}
fn advance_doc(&mut self, target: DocId) -> DocId {
for s in &mut self.sub_spans {
s.advance_doc(target);
}
self.current_doc = self.advance_to_common_doc();
self.first_in_doc = true;
self.current_doc
}
fn next_start_position(&mut self) -> u32 {
if self.current_doc == NO_MORE_DOCS {
return NO_MORE_POSITIONS;
}
if self.first_in_doc {
self.first_in_doc = false;
for s in &mut self.sub_spans {
if s.next_start_position() == NO_MORE_POSITIONS {
return NO_MORE_POSITIONS;
}
}
} else {
if self.sub_spans[0].next_start_position() == NO_MORE_POSITIONS {
return NO_MORE_POSITIONS;
}
}
if self.find_next_match_in_doc() {
self.match_start
} else {
NO_MORE_POSITIONS
}
}
fn start_position(&self) -> u32 {
self.match_start
}
fn end_position(&self) -> u32 {
self.match_end
}
fn width(&self) -> u32 {
self.match_width
}
}
struct NearSpansUnordered<'a> {
sub_spans: Vec<TermSpans<'a>>,
slop: u32,
current_doc: DocId,
match_start: u32,
match_end: u32,
match_width: u32,
indices: Vec<usize>,
first_in_doc: bool,
}
unsafe impl Send for NearSpansUnordered<'_> {}
impl<'a> NearSpansUnordered<'a> {
fn new(sub_spans: Vec<TermSpans<'a>>, slop: u32) -> Self {
let n = sub_spans.len();
Self {
sub_spans,
slop,
current_doc: NO_MORE_DOCS,
match_start: NO_MORE_POSITIONS,
match_end: NO_MORE_POSITIONS,
match_width: 0,
indices: vec![0; n],
first_in_doc: false,
}
}
fn advance_to_common_doc(&mut self) -> DocId {
if self.sub_spans.is_empty() {
return NO_MORE_DOCS;
}
let mut target = self.sub_spans[0].doc_id();
if target == NO_MORE_DOCS {
return NO_MORE_DOCS;
}
let mut i = 1;
while i < self.sub_spans.len() {
let doc = self.sub_spans[i].doc_id();
if doc == target {
i += 1;
continue;
}
if doc == NO_MORE_DOCS {
return NO_MORE_DOCS;
}
if doc < target {
let new_doc = self.sub_spans[i].advance_doc(target);
if new_doc == NO_MORE_DOCS {
return NO_MORE_DOCS;
}
if new_doc > target {
target = new_doc;
let d0 = self.sub_spans[0].advance_doc(target);
if d0 == NO_MORE_DOCS {
return NO_MORE_DOCS;
}
target = d0;
i = 1;
continue;
}
i += 1;
} else {
target = doc;
let d0 = self.sub_spans[0].advance_doc(target);
if d0 == NO_MORE_DOCS {
return NO_MORE_DOCS;
}
target = d0;
i = 1;
}
}
target
}
fn get_positions(&self, i: usize) -> Vec<u32> {
let s = &self.sub_spans[i];
if s.current_tf == 1 {
vec![s.reader.first_position()]
} else {
s.reader.positions().to_vec()
}
}
fn find_match_unordered(&mut self) -> bool {
let n = self.sub_spans.len();
let all_positions: Vec<Vec<u32>> = (0..n).map(|i| self.get_positions(i)).collect();
for positions in &all_positions {
if positions.is_empty() {
return false;
}
}
for idx in &mut self.indices {
*idx = 0;
}
let max_span = self.slop + n as u32 - 1;
loop {
let mut min_pos = u32::MAX;
let mut max_pos = 0u32;
let mut min_idx = 0;
for (i, &idx) in self.indices.iter().enumerate() {
if idx >= all_positions[i].len() {
return false;
}
let pos = all_positions[i][idx];
if pos < min_pos {
min_pos = pos;
min_idx = i;
}
if pos > max_pos {
max_pos = pos;
}
}
let window = max_pos - min_pos;
if window <= max_span {
self.match_start = min_pos;
self.match_end = max_pos + 1;
self.match_width = window - (n as u32 - 1); return true;
}
self.indices[min_idx] += 1;
if self.indices[min_idx] >= all_positions[min_idx].len() {
return false;
}
}
}
}
impl Spans for NearSpansUnordered<'_> {
fn doc_id(&self) -> DocId {
self.current_doc
}
fn next_doc(&mut self) -> DocId {
let next = self.sub_spans[0].next_doc();
if next == NO_MORE_DOCS {
self.current_doc = NO_MORE_DOCS;
return NO_MORE_DOCS;
}
for i in 1..self.sub_spans.len() {
self.sub_spans[i].next_doc();
}
self.current_doc = self.advance_to_common_doc();
self.first_in_doc = true;
self.current_doc
}
fn advance_doc(&mut self, target: DocId) -> DocId {
for s in &mut self.sub_spans {
s.advance_doc(target);
}
self.current_doc = self.advance_to_common_doc();
self.first_in_doc = true;
self.current_doc
}
fn next_start_position(&mut self) -> u32 {
if self.current_doc == NO_MORE_DOCS {
return NO_MORE_POSITIONS;
}
if self.first_in_doc {
self.first_in_doc = false;
if self.find_match_unordered() {
return self.match_start;
}
return NO_MORE_POSITIONS;
}
NO_MORE_POSITIONS
}
fn start_position(&self) -> u32 {
self.match_start
}
fn end_position(&self) -> u32 {
self.match_end
}
fn width(&self) -> u32 {
self.match_width
}
}
pub struct SpanNotQuery {
pub(crate) include: Box<dyn SpanQuery>,
pub(crate) exclude: Box<dyn SpanQuery>,
}
impl Query for SpanNotQuery {
fn bind(&self, searcher: &Searcher, score_mode: ScoreMode) -> Result<Box<dyn BoundQuery>> {
Ok(<Self as SpanQuery>::bind_span(self, searcher, score_mode)?)
}
}
impl SpanQuery for SpanNotQuery {
fn bind_span(
&self,
searcher: &Searcher,
score_mode: ScoreMode,
) -> Result<Box<dyn BoundSpanQuery>> {
let include_weight = self.include.bind_span(searcher, score_mode)?;
let exclude_weight = self.exclude.bind_span(searcher, score_mode)?;
Ok(Box::new(BoundSpanNotQuery {
include_weight,
exclude_weight,
}))
}
}
struct BoundSpanNotQuery {
include_weight: Box<dyn BoundSpanQuery>,
exclude_weight: Box<dyn BoundSpanQuery>,
}
impl BoundQuery for BoundSpanNotQuery {
fn scorer_supplier(&self, reader: &SegmentReader) -> Result<Option<Box<dyn ScorerSupplier>>> {
let include = match self.include_weight.scorer_supplier(reader)? {
Some(s) => s,
None => return Ok(None),
};
let exclude = self.exclude_weight.scorer_supplier(reader)?;
Ok(Some(Box::new(SpanNotScorerSupplier { include, exclude })))
}
}
impl BoundSpanQuery for BoundSpanNotQuery {
fn span_scorer_supplier(
&self,
reader: &SegmentReader,
max_end: u32,
) -> Result<Option<Box<dyn ScorerSupplier>>> {
let include = match self.include_weight.span_scorer_supplier(reader, max_end)? {
Some(s) => s,
None => return Ok(None),
};
let exclude = self.exclude_weight.scorer_supplier(reader)?;
Ok(Some(Box::new(SpanNotScorerSupplier { include, exclude })))
}
}
struct SpanNotScorerSupplier {
include: Box<dyn ScorerSupplier>,
exclude: Option<Box<dyn ScorerSupplier>>,
}
impl ScorerSupplier for SpanNotScorerSupplier {
fn cost(&self) -> u64 {
self.include.cost()
}
fn scorer(self: Box<Self>) -> Result<Box<dyn Scorer>> {
let include = self.include.scorer()?;
let exclude = match self.exclude {
Some(e) => Some(e.scorer()?),
None => None,
};
let mut scorer = SpanNotScorer { include, exclude };
scorer.find_next_non_excluded();
Ok(Box::new(scorer))
}
}
struct SpanNotScorer {
include: Box<dyn Scorer>,
exclude: Option<Box<dyn Scorer>>,
}
impl SpanNotScorer {
fn is_excluded(&mut self) -> bool {
let Some(ref mut exc) = self.exclude else {
return false;
};
let doc = self.include.doc_id();
if exc.doc_id() < doc {
exc.advance(doc);
}
exc.doc_id() == doc
}
fn find_next_non_excluded(&mut self) -> DocId {
loop {
let doc = self.include.doc_id();
if doc == NO_MORE_DOCS {
return NO_MORE_DOCS;
}
if !self.is_excluded() {
return doc;
}
self.include.next();
}
}
}
impl Scorer for SpanNotScorer {
fn doc_id(&self) -> DocId {
self.include.doc_id()
}
fn next(&mut self) -> DocId {
self.include.next();
self.find_next_non_excluded()
}
fn advance(&mut self, target: DocId) -> DocId {
self.include.advance(target);
self.find_next_non_excluded()
}
fn score(&mut self) -> f32 {
self.include.score()
}
fn two_phase(&mut self) -> Option<&mut dyn TwoPhaseIterator> {
None
}
}
pub struct SpanFirstQuery {
pub(crate) inner: Box<dyn SpanQuery>,
pub end: u32,
}
impl Query for SpanFirstQuery {
fn bind(&self, searcher: &Searcher, score_mode: ScoreMode) -> Result<Box<dyn BoundQuery>> {
Ok(<Self as SpanQuery>::bind_span(self, searcher, score_mode)?)
}
}
impl SpanQuery for SpanFirstQuery {
fn bind_span(
&self,
searcher: &Searcher,
score_mode: ScoreMode,
) -> Result<Box<dyn BoundSpanQuery>> {
let inner_weight = self.inner.bind_span(searcher, score_mode)?;
Ok(Box::new(BoundSpanFirstQuery {
inner_weight,
end: self.end,
}))
}
}
struct BoundSpanFirstQuery {
inner_weight: Box<dyn BoundSpanQuery>,
end: u32,
}
impl BoundQuery for BoundSpanFirstQuery {
fn scorer_supplier(&self, reader: &SegmentReader) -> Result<Option<Box<dyn ScorerSupplier>>> {
self.inner_weight.span_scorer_supplier(reader, self.end)
}
}
impl BoundSpanQuery for BoundSpanFirstQuery {
fn span_scorer_supplier(
&self,
reader: &SegmentReader,
max_end: u32,
) -> Result<Option<Box<dyn ScorerSupplier>>> {
self.inner_weight
.span_scorer_supplier(reader, max_end.min(self.end))
}
}
pub struct SpanTermQuery {
pub field: String,
pub value: String,
}
impl Query for SpanTermQuery {
fn bind(&self, searcher: &Searcher, score_mode: ScoreMode) -> Result<Box<dyn BoundQuery>> {
Ok(<Self as SpanQuery>::bind_span(self, searcher, score_mode)?)
}
}
impl SpanQuery for SpanTermQuery {
fn bind_span(&self, searcher: &Searcher, _: ScoreMode) -> Result<Box<dyn BoundSpanQuery>> {
let total_docs = searcher.total_docs();
let doc_freq = searcher.doc_freq(&self.field, &self.value);
let idf = bm25_idf(total_docs, doc_freq);
let avg_field_length = searcher.avg_field_length(&self.field);
Ok(Box::new(BoundSpanTermQuery {
field: self.field.clone(),
value: self.value.clone(),
idf,
avg_field_length,
}))
}
}
struct BoundSpanTermQuery {
field: String,
value: String,
idf: f32,
avg_field_length: f32,
}
impl BoundQuery for BoundSpanTermQuery {
fn scorer_supplier(&self, reader: &SegmentReader) -> Result<Option<Box<dyn ScorerSupplier>>> {
let field_id = match reader
.header()
.fields
.iter()
.find(|f| f.field_name == self.field)
.map(|f| f.field_id)
{
Some(id) => id,
None => return Ok(None),
};
if reader
.postings_with_positions(field_id, &self.value)
.is_none()
{
return Ok(None);
}
Ok(Some(Box::new(SpanTermScorerSupplier {
segment: reader as *const SegmentReader,
field_id,
value: self.value.clone(),
idf: self.idf,
avg_field_length: self.avg_field_length,
})))
}
}
impl BoundSpanQuery for BoundSpanTermQuery {
fn span_scorer_supplier(
&self,
reader: &SegmentReader,
max_end: u32,
) -> Result<Option<Box<dyn ScorerSupplier>>> {
let field_id = match reader
.header()
.fields
.iter()
.find(|f| f.field_name == self.field)
.map(|f| f.field_id)
{
Some(id) => id,
None => return Ok(None),
};
if reader
.postings_with_positions(field_id, &self.value)
.is_none()
{
return Ok(None);
}
Ok(Some(Box::new(FilteredSpanTermScorerSupplier {
segment: reader as *const SegmentReader,
field_id,
value: self.value.clone(),
idf: self.idf,
avg_field_length: self.avg_field_length,
max_end,
})))
}
}
struct SpanTermScorerSupplier {
segment: *const SegmentReader,
field_id: FieldId,
value: String,
idf: f32,
avg_field_length: f32,
}
unsafe impl Send for SpanTermScorerSupplier {}
impl ScorerSupplier for SpanTermScorerSupplier {
fn cost(&self) -> u64 {
1000
}
fn scorer(self: Box<Self>) -> Result<Box<dyn Scorer>> {
let reader = unsafe { &*self.segment };
let pos_reader = reader
.postings_with_positions(self.field_id, &self.value)
.unwrap();
let norms = reader.norms(self.field_id);
let mut spans = TermSpans::new(pos_reader);
spans.next_doc(); Ok(Box::new(SimpleSpanScorer {
spans,
idf: self.idf,
avg_field_length: self.avg_field_length,
norms,
}))
}
}
struct FilteredSpanTermScorerSupplier {
segment: *const SegmentReader,
field_id: FieldId,
value: String,
idf: f32,
avg_field_length: f32,
max_end: u32,
}
unsafe impl Send for FilteredSpanTermScorerSupplier {}
impl ScorerSupplier for FilteredSpanTermScorerSupplier {
fn cost(&self) -> u64 {
1000
}
fn scorer(self: Box<Self>) -> Result<Box<dyn Scorer>> {
let reader = unsafe { &*self.segment };
let pos_reader = reader
.postings_with_positions(self.field_id, &self.value)
.unwrap();
let norms = reader.norms(self.field_id);
let mut inner = TermSpans::new(pos_reader);
inner.next_doc();
let spans = FilterSpans {
inner,
max_end: self.max_end,
};
let mut scorer = FilteredSpanTermScorer {
spans,
idf: self.idf,
avg_field_length: self.avg_field_length,
norms,
freq: 0.0,
};
scorer.find_next_matching_doc();
Ok(Box::new(scorer))
}
}
struct FilteredSpanTermScorer<'a> {
spans: FilterSpans<TermSpans<'a>>,
idf: f32,
avg_field_length: f32,
norms: Option<FieldNormsReader<'a>>,
freq: f32,
}
unsafe impl Send for FilteredSpanTermScorer<'_> {}
impl FilteredSpanTermScorer<'_> {
fn find_next_matching_doc(&mut self) -> DocId {
loop {
if self.spans.doc_id() == NO_MORE_DOCS {
self.freq = 0.0;
return NO_MORE_DOCS;
}
let mut freq = 0.0f32;
while self.spans.next_start_position() != NO_MORE_POSITIONS {
freq += 1.0;
}
if freq > 0.0 {
self.freq = freq;
return self.spans.doc_id();
}
self.spans.next_doc();
}
}
}
impl Scorer for FilteredSpanTermScorer<'_> {
fn doc_id(&self) -> DocId {
self.spans.doc_id()
}
fn next(&mut self) -> DocId {
self.spans.next_doc();
self.find_next_matching_doc()
}
fn advance(&mut self, target: DocId) -> DocId {
self.spans.advance_doc(target);
self.find_next_matching_doc()
}
fn score(&mut self) -> f32 {
let dl = self
.norms
.as_ref()
.map(|n| n.norm(self.doc_id()))
.unwrap_or(1.0);
bm25_score(self.idf, self.freq, dl, self.avg_field_length)
}
fn two_phase(&mut self) -> Option<&mut dyn TwoPhaseIterator> {
None
}
}
pub struct SpanNearQuery {
pub field: String,
pub terms: Vec<String>,
pub slop: u32,
pub in_order: bool,
}
impl Query for SpanNearQuery {
fn bind(&self, searcher: &Searcher, score_mode: ScoreMode) -> Result<Box<dyn BoundQuery>> {
Ok(<Self as SpanQuery>::bind_span(self, searcher, score_mode)?)
}
}
impl SpanQuery for SpanNearQuery {
fn bind_span(&self, searcher: &Searcher, _: ScoreMode) -> Result<Box<dyn BoundSpanQuery>> {
let total_docs = searcher.total_docs();
let idf: f32 = self
.terms
.iter()
.map(|t| bm25_idf(total_docs, searcher.doc_freq(&self.field, t)))
.sum();
let avg_field_length = searcher.avg_field_length(&self.field);
Ok(Box::new(BoundSpanNearQuery {
field: self.field.clone(),
terms: self.terms.clone(),
slop: self.slop,
in_order: self.in_order,
idf,
avg_field_length,
}))
}
}
struct BoundSpanNearQuery {
field: String,
terms: Vec<String>,
slop: u32,
in_order: bool,
idf: f32,
avg_field_length: f32,
}
impl BoundQuery for BoundSpanNearQuery {
fn scorer_supplier(&self, reader: &SegmentReader) -> Result<Option<Box<dyn ScorerSupplier>>> {
let field_id = match reader
.header()
.fields
.iter()
.find(|f| f.field_name == self.field)
.map(|f| f.field_id)
{
Some(id) => id,
None => return Ok(None),
};
for term in &self.terms {
if reader.postings_with_positions(field_id, term).is_none() {
return Ok(None);
}
}
Ok(Some(Box::new(SpanNearScorerSupplier {
segment: reader as *const SegmentReader,
field_id,
terms: self.terms.clone(),
slop: self.slop,
in_order: self.in_order,
idf: self.idf,
avg_field_length: self.avg_field_length,
max_end: None,
})))
}
}
impl BoundSpanQuery for BoundSpanNearQuery {
fn span_scorer_supplier(
&self,
reader: &SegmentReader,
max_end: u32,
) -> Result<Option<Box<dyn ScorerSupplier>>> {
let field_id = match reader
.header()
.fields
.iter()
.find(|f| f.field_name == self.field)
.map(|f| f.field_id)
{
Some(id) => id,
None => return Ok(None),
};
for term in &self.terms {
if reader.postings_with_positions(field_id, term).is_none() {
return Ok(None);
}
}
Ok(Some(Box::new(SpanNearScorerSupplier {
segment: reader as *const SegmentReader,
field_id,
terms: self.terms.clone(),
slop: self.slop,
in_order: self.in_order,
idf: self.idf,
avg_field_length: self.avg_field_length,
max_end: Some(max_end),
})))
}
}
struct SpanNearScorerSupplier {
segment: *const SegmentReader,
field_id: FieldId,
terms: Vec<String>,
slop: u32,
in_order: bool,
idf: f32,
avg_field_length: f32,
max_end: Option<u32>,
}
unsafe impl Send for SpanNearScorerSupplier {}
impl ScorerSupplier for SpanNearScorerSupplier {
fn cost(&self) -> u64 {
1000
}
fn scorer(self: Box<Self>) -> Result<Box<dyn Scorer>> {
let reader = unsafe { &*self.segment };
let sub_spans: Vec<TermSpans> = self
.terms
.iter()
.map(|t| TermSpans::new(reader.postings_with_positions(self.field_id, t).unwrap()))
.collect();
let norms = reader.norms(self.field_id);
match (self.in_order, self.max_end) {
(true, None) => {
let mut spans = NearSpansOrdered::new(sub_spans, self.slop);
spans.next_doc();
let mut scorer = TwoPhaseSpanScorer {
spans,
idf: self.idf,
avg_field_length: self.avg_field_length,
norms,
sloppy_freq: 0.0,
};
scorer.find_next_matching_doc();
Ok(Box::new(scorer))
}
(false, None) => {
let mut spans = NearSpansUnordered::new(sub_spans, self.slop);
spans.next_doc();
let mut scorer = TwoPhaseSpanScorerUnordered {
spans,
idf: self.idf,
avg_field_length: self.avg_field_length,
norms,
sloppy_freq: 0.0,
};
scorer.find_next_matching_doc();
Ok(Box::new(scorer))
}
(true, Some(max_end)) => {
let mut inner = NearSpansOrdered::new(sub_spans, self.slop);
inner.next_doc();
let spans = FilterSpans { inner, max_end };
let mut scorer = FilteredNearSpanScorer {
spans,
idf: self.idf,
avg_field_length: self.avg_field_length,
norms,
sloppy_freq: 0.0,
};
scorer.find_next_matching_doc();
Ok(Box::new(scorer))
}
(false, Some(max_end)) => {
let mut inner = NearSpansUnordered::new(sub_spans, self.slop);
inner.next_doc();
let spans = FilterSpans { inner, max_end };
let mut scorer = FilteredNearSpanScorer {
spans,
idf: self.idf,
avg_field_length: self.avg_field_length,
norms,
sloppy_freq: 0.0,
};
scorer.find_next_matching_doc();
Ok(Box::new(scorer))
}
}
}
}
struct FilteredNearSpanScorer<'a, S: Spans> {
spans: FilterSpans<S>,
idf: f32,
avg_field_length: f32,
norms: Option<FieldNormsReader<'a>>,
sloppy_freq: f32,
}
unsafe impl<S: Spans> Send for FilteredNearSpanScorer<'_, S> {}
impl<S: Spans> FilteredNearSpanScorer<'_, S> {
fn find_next_matching_doc(&mut self) -> DocId {
loop {
if self.spans.doc_id() == NO_MORE_DOCS {
self.sloppy_freq = 0.0;
return NO_MORE_DOCS;
}
let mut freq = 0.0f32;
while self.spans.next_start_position() != NO_MORE_POSITIONS {
freq += 1.0 / (1.0 + self.spans.width() as f32);
}
if freq > 0.0 {
self.sloppy_freq = freq;
return self.spans.doc_id();
}
self.spans.next_doc();
}
}
}
impl<S: Spans> Scorer for FilteredNearSpanScorer<'_, S> {
fn doc_id(&self) -> DocId {
self.spans.doc_id()
}
fn next(&mut self) -> DocId {
self.spans.next_doc();
self.find_next_matching_doc()
}
fn advance(&mut self, target: DocId) -> DocId {
self.spans.advance_doc(target);
self.find_next_matching_doc()
}
fn score(&mut self) -> f32 {
let dl = self
.norms
.as_ref()
.map(|n| n.norm(self.doc_id()))
.unwrap_or(1.0);
bm25_score(self.idf, self.sloppy_freq, dl, self.avg_field_length)
}
fn two_phase(&mut self) -> Option<&mut dyn TwoPhaseIterator> {
None
}
}
struct SimpleSpanScorer<'a> {
spans: TermSpans<'a>,
idf: f32,
avg_field_length: f32,
norms: Option<FieldNormsReader<'a>>,
}
unsafe impl Send for SimpleSpanScorer<'_> {}
impl Scorer for SimpleSpanScorer<'_> {
fn doc_id(&self) -> DocId {
self.spans.doc_id()
}
fn next(&mut self) -> DocId {
self.spans.next_doc()
}
fn advance(&mut self, target: DocId) -> DocId {
self.spans.advance_doc(target)
}
fn score(&mut self) -> f32 {
let tf = self.spans.current_tf as f32;
let dl = self
.norms
.as_ref()
.map(|n| n.norm(self.doc_id()))
.unwrap_or(1.0);
bm25_score(self.idf, tf, dl, self.avg_field_length)
}
fn two_phase(&mut self) -> Option<&mut dyn TwoPhaseIterator> {
None
}
}
struct TwoPhaseSpanScorer<'a> {
spans: NearSpansOrdered<'a>,
idf: f32,
avg_field_length: f32,
norms: Option<FieldNormsReader<'a>>,
sloppy_freq: f32,
}
unsafe impl Send for TwoPhaseSpanScorer<'_> {}
impl TwoPhaseSpanScorer<'_> {
fn find_next_matching_doc(&mut self) -> DocId {
loop {
if self.spans.current_doc == NO_MORE_DOCS {
self.sloppy_freq = 0.0;
return NO_MORE_DOCS;
}
let mut freq: f32 = 0.0;
while self.spans.next_start_position() != NO_MORE_POSITIONS {
freq += 1.0 / (1.0 + self.spans.width() as f32);
}
if freq > 0.0 {
self.sloppy_freq = freq;
return self.spans.current_doc;
}
self.spans.next_doc();
}
}
}
impl Scorer for TwoPhaseSpanScorer<'_> {
fn doc_id(&self) -> DocId {
self.spans.doc_id()
}
fn next(&mut self) -> DocId {
self.spans.next_doc();
self.find_next_matching_doc()
}
fn advance(&mut self, target: DocId) -> DocId {
self.spans.advance_doc(target);
self.find_next_matching_doc()
}
fn score(&mut self) -> f32 {
let dl = self
.norms
.as_ref()
.map(|n| n.norm(self.doc_id()))
.unwrap_or(1.0);
bm25_score(self.idf, self.sloppy_freq, dl, self.avg_field_length)
}
fn two_phase(&mut self) -> Option<&mut dyn TwoPhaseIterator> {
None
}
}
struct TwoPhaseSpanScorerUnordered<'a> {
spans: NearSpansUnordered<'a>,
idf: f32,
avg_field_length: f32,
norms: Option<FieldNormsReader<'a>>,
sloppy_freq: f32,
}
unsafe impl Send for TwoPhaseSpanScorerUnordered<'_> {}
impl TwoPhaseSpanScorerUnordered<'_> {
fn find_next_matching_doc(&mut self) -> DocId {
loop {
if self.spans.current_doc == NO_MORE_DOCS {
self.sloppy_freq = 0.0;
return NO_MORE_DOCS;
}
let mut freq: f32 = 0.0;
while self.spans.next_start_position() != NO_MORE_POSITIONS {
freq += 1.0 / (1.0 + self.spans.width() as f32);
}
if freq > 0.0 {
self.sloppy_freq = freq;
return self.spans.current_doc;
}
self.spans.next_doc();
}
}
}
impl Scorer for TwoPhaseSpanScorerUnordered<'_> {
fn doc_id(&self) -> DocId {
self.spans.doc_id()
}
fn next(&mut self) -> DocId {
self.spans.next_doc();
self.find_next_matching_doc()
}
fn advance(&mut self, target: DocId) -> DocId {
self.spans.advance_doc(target);
self.find_next_matching_doc()
}
fn score(&mut self) -> f32 {
let dl = self
.norms
.as_ref()
.map(|n| n.norm(self.doc_id()))
.unwrap_or(1.0);
bm25_score(self.idf, self.sloppy_freq, dl, self.avg_field_length)
}
fn two_phase(&mut self) -> Option<&mut dyn TwoPhaseIterator> {
None
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::analysis::Token;
use crate::core::SegmentId;
use crate::mapping::{FieldType, Mapping};
use crate::segment::builder::SegmentBuilder;
fn make_tokens(terms: &[&str]) -> Vec<Token> {
terms
.iter()
.enumerate()
.map(|(i, t)| Token::new(*t, 0, t.len(), i as u32))
.collect()
}
fn build_store(docs: &[&[&str]]) -> crate::search::segment_store::SegmentStore {
let schema = Mapping::builder().field("text", FieldType::Text).build();
let mut builder = SegmentBuilder::new(SegmentId::new(1), &schema);
for terms in docs {
builder.add_document(&[(FieldId::new(0), make_tokens(terms))], b"{}");
}
let reader = SegmentReader::open(builder.build()).unwrap();
crate::search::segment_store::SegmentStore::new(
vec![reader],
crate::analysis::AnalyzerRegistry::new(),
None,
None,
)
}
#[test]
fn span_term_basic() {
let store = build_store(&[
&["the", "quick", "brown", "fox"],
&["the", "lazy", "dog"],
&["quick", "fox"],
]);
let searcher = Searcher::new(&store);
let results = searcher
.search_query(
&SpanTermQuery {
field: "text".into(),
value: "quick".into(),
},
10,
0,
)
.unwrap();
assert_eq!(results.total_hits.value, 2); }
#[test]
fn span_term_missing() {
let store = build_store(&[&["the", "quick"]]);
let searcher = Searcher::new(&store);
let results = searcher
.search_query(
&SpanTermQuery {
field: "text".into(),
value: "nonexistent".into(),
},
10,
0,
)
.unwrap();
assert_eq!(results.total_hits.value, 0);
}
#[test]
fn span_near_exact_phrase() {
let store = build_store(&[
&["the", "quick", "brown", "fox"], &["brown", "quick", "fox"], &["quick", "brown"], ]);
let searcher = Searcher::new(&store);
let results = searcher
.search_query(
&SpanNearQuery {
field: "text".into(),
terms: vec!["quick".into(), "brown".into()],
slop: 0,
in_order: true,
},
10,
0,
)
.unwrap();
assert_eq!(results.total_hits.value, 2); }
#[test]
fn span_near_with_slop() {
let store = build_store(&[
&["quick", "brown", "fox"], &["quick", "fox"], &["quick", "a", "b", "fox"], ]);
let searcher = Searcher::new(&store);
let results = searcher
.search_query(
&SpanNearQuery {
field: "text".into(),
terms: vec!["quick".into(), "fox".into()],
slop: 1,
in_order: true,
},
10,
0,
)
.unwrap();
assert_eq!(results.total_hits.value, 2); }
#[test]
fn span_near_no_match() {
let store = build_store(&[
&["quick", "a", "b", "c", "fox"], ]);
let searcher = Searcher::new(&store);
let results = searcher
.search_query(
&SpanNearQuery {
field: "text".into(),
terms: vec!["quick".into(), "fox".into()],
slop: 1,
in_order: true,
},
10,
0,
)
.unwrap();
assert_eq!(results.total_hits.value, 0);
}
#[test]
fn span_near_three_terms() {
let store = build_store(&[
&["the", "quick", "brown", "fox"], &["quick", "fox", "brown"], ]);
let searcher = Searcher::new(&store);
let results = searcher
.search_query(
&SpanNearQuery {
field: "text".into(),
terms: vec!["quick".into(), "brown".into(), "fox".into()],
slop: 0,
in_order: true,
},
10,
0,
)
.unwrap();
assert_eq!(results.total_hits.value, 1); }
#[test]
fn span_near_wrong_order() {
let store = build_store(&[
&["fox", "quick"], ]);
let searcher = Searcher::new(&store);
let results = searcher
.search_query(
&SpanNearQuery {
field: "text".into(),
terms: vec!["quick".into(), "fox".into()],
slop: 5,
in_order: true,
},
10,
0,
)
.unwrap();
assert_eq!(results.total_hits.value, 0);
}
#[test]
fn span_near_one_term_missing() {
let store = build_store(&[&["quick", "brown"]]);
let searcher = Searcher::new(&store);
let results = searcher
.search_query(
&SpanNearQuery {
field: "text".into(),
terms: vec!["quick".into(), "nonexistent".into()],
slop: 10,
in_order: true,
},
10,
0,
)
.unwrap();
assert_eq!(results.total_hits.value, 0);
}
#[test]
fn span_near_unordered_basic() {
let store = build_store(&[
&["the", "fox", "quick"], &["quick", "a", "b", "c", "fox"], ]);
let searcher = Searcher::new(&store);
let results = searcher
.search_query(
&SpanNearQuery {
field: "text".into(),
terms: vec!["quick".into(), "fox".into()],
slop: 1,
in_order: false,
},
10,
0,
)
.unwrap();
assert_eq!(results.total_hits.value, 1); }
#[test]
fn span_near_unordered_reversed() {
let store = build_store(&[&["fox", "quick"]]);
let searcher = Searcher::new(&store);
let ordered = searcher
.search_query(
&SpanNearQuery {
field: "text".into(),
terms: vec!["quick".into(), "fox".into()],
slop: 1,
in_order: true,
},
10,
0,
)
.unwrap();
assert_eq!(ordered.total_hits.value, 0);
let unordered = searcher
.search_query(
&SpanNearQuery {
field: "text".into(),
terms: vec!["quick".into(), "fox".into()],
slop: 0,
in_order: false,
},
10,
0,
)
.unwrap();
assert_eq!(unordered.total_hits.value, 1); }
#[test]
fn span_not_basic() {
let store = build_store(&[
&["quick", "fox"], &["quick", "brown"], &["slow", "dog"], ]);
let searcher = Searcher::new(&store);
let results = searcher
.search_query(
&SpanNotQuery {
include: Box::new(SpanTermQuery {
field: "text".into(),
value: "quick".into(),
}),
exclude: Box::new(SpanTermQuery {
field: "text".into(),
value: "fox".into(),
}),
},
10,
0,
)
.unwrap();
assert_eq!(results.total_hits.value, 1); }
#[test]
fn span_not_no_exclusions() {
let store = build_store(&[&["quick", "fox"], &["quick", "brown"]]);
let searcher = Searcher::new(&store);
let results = searcher
.search_query(
&SpanNotQuery {
include: Box::new(SpanTermQuery {
field: "text".into(),
value: "quick".into(),
}),
exclude: Box::new(SpanTermQuery {
field: "text".into(),
value: "nonexistent".into(),
}),
},
10,
0,
)
.unwrap();
assert_eq!(results.total_hits.value, 2); }
#[test]
fn span_term_score_uses_bm25_tf() {
let store = build_store(&[
&["search", "engine", "search"], &["search", "tools"], ]);
let searcher = Searcher::new(&store);
let query = SpanTermQuery {
field: "text".into(),
value: "search".into(),
};
let weight = query.bind(&searcher, ScoreMode::Complete).unwrap();
let supplier = weight
.scorer_supplier(&searcher.segments()[0])
.unwrap()
.unwrap();
let mut scorer = supplier.scorer().unwrap();
assert_eq!(scorer.doc_id(), DocId::new(0));
let doc0_score = scorer.score();
scorer.next();
assert_eq!(scorer.doc_id(), DocId::new(1));
let doc1_score = scorer.score();
assert!(
doc0_score > doc1_score,
"doc with tf=2 ({doc0_score}) must score higher than doc with tf=1 \
({doc1_score}) — span_term must use BM25 TF, not hardcoded 1.0"
);
}
#[test]
fn span_term_score_matches_term_query() {
let store = build_store(&[&["search", "engine", "search"], &["search", "tools"]]);
let searcher = Searcher::new(&store);
let span_query = SpanTermQuery {
field: "text".into(),
value: "search".into(),
};
let term_query = crate::query::term::TermQuery {
field: "text".into(),
value: "search".into(),
};
let span_weight = span_query.bind(&searcher, ScoreMode::Complete).unwrap();
let span_supplier = span_weight
.scorer_supplier(&searcher.segments()[0])
.unwrap()
.unwrap();
let mut span_scorer = span_supplier.scorer().unwrap();
let term_weight = term_query.bind(&searcher, ScoreMode::Complete).unwrap();
let term_supplier = term_weight
.scorer_supplier(&searcher.segments()[0])
.unwrap()
.unwrap();
let mut term_scorer = term_supplier.scorer().unwrap();
for _ in 0..2 {
assert_eq!(span_scorer.doc_id(), term_scorer.doc_id());
let span_score = span_scorer.score();
let term_score = term_scorer.score();
assert!(
(span_score - term_score).abs() < 1e-5,
"span_term score ({span_score}) must equal term query score ({term_score}) \
for doc {:?}",
span_scorer.doc_id()
);
span_scorer.next();
term_scorer.next();
}
}
#[test]
fn span_near_sloppy_freq_penalizes_width() {
let store = build_store(&[
&["quick", "brown", "a", "b", "c"],
&["quick", "a", "b", "brown", "c"],
]);
let searcher = Searcher::new(&store);
let query = SpanNearQuery {
field: "text".into(),
terms: vec!["quick".into(), "brown".into()],
slop: 5,
in_order: true,
};
let weight = query.bind(&searcher, ScoreMode::Complete).unwrap();
let supplier = weight
.scorer_supplier(&searcher.segments()[0])
.unwrap()
.unwrap();
let mut scorer = supplier.scorer().unwrap();
assert_eq!(scorer.doc_id(), DocId::new(0));
let exact_score = scorer.score();
scorer.next();
assert_eq!(scorer.doc_id(), DocId::new(1));
let sloppy_score = scorer.score();
assert!(
exact_score > sloppy_score,
"exact match ({exact_score}) must score higher than sloppy match ({sloppy_score}) — \
sloppy frequency must penalize width"
);
}
#[test]
fn span_near_score_uses_bm25() {
let store = build_store(&[
&["quick", "brown", "and", "quick", "brown", "fox"],
&["quick", "brown", "fox", "and", "lazy", "dog"],
]);
let searcher = Searcher::new(&store);
let query = SpanNearQuery {
field: "text".into(),
terms: vec!["quick".into(), "brown".into()],
slop: 0,
in_order: true,
};
let weight = query.bind(&searcher, ScoreMode::Complete).unwrap();
let supplier = weight
.scorer_supplier(&searcher.segments()[0])
.unwrap()
.unwrap();
let mut scorer = supplier.scorer().unwrap();
assert_eq!(scorer.doc_id(), DocId::new(0));
let doc0_score = scorer.score();
scorer.next();
assert_eq!(scorer.doc_id(), DocId::new(1));
let doc1_score = scorer.score();
assert_ne!(doc0_score, 1.0, "span_near score must not be hardcoded 1.0");
assert!(
doc0_score > doc1_score,
"doc with 2 near matches ({doc0_score}) must score higher than \
doc with 1 near match ({doc1_score})"
);
}
#[test]
fn span_not_delegates_score() {
let store = build_store(&[
&["search", "engine", "search"], &["search", "tools"], ]);
let searcher = Searcher::new(&store);
let query = SpanNotQuery {
include: Box::new(SpanTermQuery {
field: "text".into(),
value: "search".into(),
}),
exclude: Box::new(SpanTermQuery {
field: "text".into(),
value: "lazy".into(),
}),
};
let weight = query.bind(&searcher, ScoreMode::Complete).unwrap();
let supplier = weight
.scorer_supplier(&searcher.segments()[0])
.unwrap()
.unwrap();
let mut scorer = supplier.scorer().unwrap();
assert_eq!(scorer.doc_id(), DocId::new(0));
let doc0_score = scorer.score();
scorer.next();
assert_eq!(scorer.doc_id(), DocId::new(1));
let doc1_score = scorer.score();
assert_ne!(doc0_score, 1.0, "span_not score must not be hardcoded 1.0");
assert!(
doc0_score > doc1_score,
"span_not must delegate to include score: doc0 ({doc0_score}) should > doc1 ({doc1_score})"
);
}
}