Skip to main content

luci/query/
span.rs

1//! Span queries: positional matching with configurable distance and ordering.
2//!
3//! Implements the Spans abstraction from [[feature-span-queries]]: each span query
4//! yields `(doc, start, end)` tuples that can be composed by outer queries.
5//!
6//! Two-phase approach: doc-level conjunction first (cheap), position
7//! verification second (expensive). Follows Lucene's ConjunctionSpans pattern.
8
9use crate::core::{DocId, FieldId, NO_MORE_DOCS, Result, ScoreMode, Scorer, TwoPhaseIterator};
10
11use crate::inverted::norms::FieldNormsReader;
12use crate::inverted::postings::PositionPostingListReader;
13use crate::query::{BoundQuery, BoundSpanQuery, Query, ScorerSupplier, SpanQuery};
14use crate::search::bm25::{bm25_idf, bm25_score};
15use crate::search::searcher::Searcher;
16use crate::segment::reader::SegmentReader;
17
18const NO_MORE_POSITIONS: u32 = u32::MAX;
19
20// ---------------------------------------------------------------------------
21// Spans trait
22// ---------------------------------------------------------------------------
23
24/// Position-level iterator over spans within documents.
25///
26/// Contract:
27/// 1. Call `next_doc()` or `advance_doc()` to move to a document.
28/// 2. Call `next_start_position()` to iterate spans within the doc.
29///    Returns `NO_MORE_POSITIONS` when exhausted for this doc.
30/// 3. After `next_start_position()` returns a valid value, call
31///    `start_position()` and `end_position()` to read the span.
32trait Spans: Send {
33    fn doc_id(&self) -> DocId;
34    fn next_doc(&mut self) -> DocId;
35    fn advance_doc(&mut self, target: DocId) -> DocId;
36
37    /// Advance to next span in current doc. Returns start position,
38    /// or NO_MORE_POSITIONS if no more spans in this doc.
39    fn next_start_position(&mut self) -> u32;
40
41    fn start_position(&self) -> u32;
42    fn end_position(&self) -> u32;
43
44    /// Width of the current span match (gap positions used by slop).
45    /// Used by sloppy frequency: each match contributes
46    /// `1.0 / (1.0 + width)` to BM25 TF. Single-term spans have width=0.
47    /// See [[investigation-20260405-05-span-not-constant-score]].
48    fn width(&self) -> u32 {
49        0
50    }
51}
52
53// ---------------------------------------------------------------------------
54// TermSpans — single term, one span per position
55// ---------------------------------------------------------------------------
56
57struct TermSpans<'a> {
58    reader: PositionPostingListReader<'a>,
59    pos_index: usize,
60    current_doc: DocId,
61    /// Cached TF for the current doc (from next_doc()).
62    current_tf: u32,
63}
64
65unsafe impl Send for TermSpans<'_> {}
66
67impl<'a> TermSpans<'a> {
68    fn new(reader: PositionPostingListReader<'a>) -> Self {
69        Self {
70            reader,
71            pos_index: 0,
72            current_doc: NO_MORE_DOCS,
73            current_tf: 0,
74        }
75    }
76}
77
78impl Spans for TermSpans<'_> {
79    fn doc_id(&self) -> DocId {
80        self.current_doc
81    }
82
83    fn next_doc(&mut self) -> DocId {
84        self.pos_index = 0;
85        match self.reader.next_doc() {
86            Some(doc) => {
87                self.current_doc = doc;
88                self.current_tf = self.reader.current_tf();
89                doc
90            }
91            None => {
92                self.current_doc = NO_MORE_DOCS;
93                self.current_tf = 0;
94                NO_MORE_DOCS
95            }
96        }
97    }
98
99    fn advance_doc(&mut self, target: DocId) -> DocId {
100        self.pos_index = 0;
101        match self.reader.advance(target) {
102            Some(doc) => {
103                self.current_doc = doc;
104                // advance() always fills position_buf
105                self.current_tf = self.reader.positions().len() as u32;
106                doc
107            }
108            None => {
109                self.current_doc = NO_MORE_DOCS;
110                self.current_tf = 0;
111                NO_MORE_DOCS
112            }
113        }
114    }
115
116    fn next_start_position(&mut self) -> u32 {
117        if self.current_doc == NO_MORE_DOCS {
118            return NO_MORE_POSITIONS;
119        }
120
121        if self.current_tf == 1 {
122            // TF=1: single position from cached first_position
123            if self.pos_index == 0 {
124                self.pos_index = 1;
125                return self.reader.first_position();
126            }
127            return NO_MORE_POSITIONS;
128        }
129
130        // TF>1: positions in position_buf
131        let positions = self.reader.positions();
132        if self.pos_index < positions.len() {
133            let pos = positions[self.pos_index];
134            self.pos_index += 1;
135            pos
136        } else {
137            NO_MORE_POSITIONS
138        }
139    }
140
141    fn start_position(&self) -> u32 {
142        if self.pos_index == 0 {
143            return NO_MORE_POSITIONS;
144        }
145        if self.current_tf == 1 {
146            self.reader.first_position()
147        } else {
148            self.reader.positions()[self.pos_index - 1]
149        }
150    }
151
152    fn end_position(&self) -> u32 {
153        if self.pos_index == 0 {
154            return NO_MORE_POSITIONS;
155        }
156        self.start_position() + 1
157    }
158}
159
160// ---------------------------------------------------------------------------
161// FilterSpans — position-filter wrapper used by SpanFirst
162// ---------------------------------------------------------------------------
163
164/// Wraps any Spans iterator, rejecting spans whose end_position exceeds
165/// ``max_end``. Positions within a doc are monotonically non-decreasing,
166/// so once a span's end_position exceeds the limit we return
167/// ``NO_MORE_POSITIONS`` (Lucene's ``NO_MORE_IN_CURRENT_DOC`` contract —
168/// see the FilterSpans reference implementation).
169struct FilterSpans<S: Spans> {
170    inner: S,
171    max_end: u32,
172}
173
174impl<S: Spans> Spans for FilterSpans<S> {
175    fn doc_id(&self) -> DocId {
176        self.inner.doc_id()
177    }
178    fn next_doc(&mut self) -> DocId {
179        self.inner.next_doc()
180    }
181    fn advance_doc(&mut self, target: DocId) -> DocId {
182        self.inner.advance_doc(target)
183    }
184    fn next_start_position(&mut self) -> u32 {
185        let pos = self.inner.next_start_position();
186        if pos == NO_MORE_POSITIONS {
187            return NO_MORE_POSITIONS;
188        }
189        if self.inner.end_position() > self.max_end {
190            // Positions are ascending; once out of range, no more in this doc.
191            NO_MORE_POSITIONS
192        } else {
193            pos
194        }
195    }
196    fn start_position(&self) -> u32 {
197        self.inner.start_position()
198    }
199    fn end_position(&self) -> u32 {
200        self.inner.end_position()
201    }
202    fn width(&self) -> u32 {
203        self.inner.width()
204    }
205}
206
207// ---------------------------------------------------------------------------
208// NearSpansOrdered — ordered proximity matching
209// ---------------------------------------------------------------------------
210
211/// Finds documents where all sub-spans appear in order within `slop` gaps.
212struct NearSpansOrdered<'a> {
213    sub_spans: Vec<TermSpans<'a>>,
214    slop: u32,
215    current_doc: DocId,
216    match_start: u32,
217    match_end: u32,
218    match_width: u32,
219    /// Whether we've found a match in the current doc and need to
220    /// find the next one on the next call to next_start_position().
221    first_in_doc: bool,
222}
223
224unsafe impl Send for NearSpansOrdered<'_> {}
225
226impl<'a> NearSpansOrdered<'a> {
227    fn new(sub_spans: Vec<TermSpans<'a>>, slop: u32) -> Self {
228        Self {
229            sub_spans,
230            slop,
231            current_doc: NO_MORE_DOCS,
232            match_start: NO_MORE_POSITIONS,
233            match_end: NO_MORE_POSITIONS,
234            match_width: 0,
235            first_in_doc: false,
236        }
237    }
238
239    /// Advance all sub-spans to the same document using conjunction.
240    /// Returns the common doc ID, or NO_MORE_DOCS if no more common docs.
241    fn advance_to_common_doc(&mut self) -> DocId {
242        if self.sub_spans.is_empty() {
243            return NO_MORE_DOCS;
244        }
245
246        // Start from the first sub-span's current doc
247        let mut target = self.sub_spans[0].doc_id();
248        if target == NO_MORE_DOCS {
249            return NO_MORE_DOCS;
250        }
251
252        let mut i = 1;
253        while i < self.sub_spans.len() {
254            let doc = self.sub_spans[i].doc_id();
255            if doc == target {
256                i += 1;
257                continue;
258            }
259            if doc == NO_MORE_DOCS {
260                return NO_MORE_DOCS;
261            }
262            if doc < target {
263                // Advance this sub-span to target
264                let new_doc = self.sub_spans[i].advance_doc(target);
265                if new_doc == NO_MORE_DOCS {
266                    return NO_MORE_DOCS;
267                }
268                if new_doc > target {
269                    // Overshot — restart from sub_spans[0]
270                    target = new_doc;
271                    // Advance sub_spans[0] to new target
272                    let d0 = self.sub_spans[0].advance_doc(target);
273                    if d0 == NO_MORE_DOCS {
274                        return NO_MORE_DOCS;
275                    }
276                    target = d0;
277                    i = 1; // restart alignment
278                    continue;
279                }
280                i += 1;
281            } else {
282                // doc > target — advance sub_spans[0] and restart
283                target = doc;
284                let d0 = self.sub_spans[0].advance_doc(target);
285                if d0 == NO_MORE_DOCS {
286                    return NO_MORE_DOCS;
287                }
288                target = d0;
289                i = 1;
290            }
291        }
292        target
293    }
294
295    /// Try to form an ordered chain of sub-spans within slop.
296    /// Returns true if a match is found, setting match_start/end/width.
297    fn stretch_to_order(&mut self) -> bool {
298        self.match_start = self.sub_spans[0].start_position();
299        if self.match_start == NO_MORE_POSITIONS {
300            return false;
301        }
302        self.match_width = 0;
303
304        for i in 1..self.sub_spans.len() {
305            let prev_end = self.sub_spans[i - 1].end_position();
306
307            // Advance sub_span[i] so start >= prev_end (non-overlapping, ordered)
308            while self.sub_spans[i].start_position() < prev_end {
309                if self.sub_spans[i].next_start_position() == NO_MORE_POSITIONS {
310                    return false;
311                }
312            }
313
314            let gap = self.sub_spans[i].start_position() - prev_end;
315            self.match_width += gap;
316        }
317
318        self.match_end = self.sub_spans.last().unwrap().end_position();
319        self.match_width <= self.slop
320    }
321
322    /// Find the next position match in the current document.
323    fn find_next_match_in_doc(&mut self) -> bool {
324        loop {
325            if !self.stretch_to_order() {
326                return false;
327            }
328            if self.match_width <= self.slop {
329                return true;
330            }
331            // Width too large — advance first sub-span and retry
332            if self.sub_spans[0].next_start_position() == NO_MORE_POSITIONS {
333                return false;
334            }
335            self.match_start = self.sub_spans[0].start_position();
336        }
337    }
338}
339
340impl Spans for NearSpansOrdered<'_> {
341    fn doc_id(&self) -> DocId {
342        self.current_doc
343    }
344
345    fn next_doc(&mut self) -> DocId {
346        // Advance all sub-spans to next doc
347        let next = self.sub_spans[0].next_doc();
348        if next == NO_MORE_DOCS {
349            self.current_doc = NO_MORE_DOCS;
350            return NO_MORE_DOCS;
351        }
352        for i in 1..self.sub_spans.len() {
353            self.sub_spans[i].next_doc();
354        }
355        self.current_doc = self.advance_to_common_doc();
356        self.first_in_doc = true;
357        self.current_doc
358    }
359
360    fn advance_doc(&mut self, target: DocId) -> DocId {
361        for s in &mut self.sub_spans {
362            s.advance_doc(target);
363        }
364        self.current_doc = self.advance_to_common_doc();
365        self.first_in_doc = true;
366        self.current_doc
367    }
368
369    fn next_start_position(&mut self) -> u32 {
370        if self.current_doc == NO_MORE_DOCS {
371            return NO_MORE_POSITIONS;
372        }
373
374        if self.first_in_doc {
375            self.first_in_doc = false;
376            // Initialize position iteration for all sub-spans
377            for s in &mut self.sub_spans {
378                if s.next_start_position() == NO_MORE_POSITIONS {
379                    return NO_MORE_POSITIONS;
380                }
381            }
382        } else {
383            // Advance the first sub-span to find the next match
384            if self.sub_spans[0].next_start_position() == NO_MORE_POSITIONS {
385                return NO_MORE_POSITIONS;
386            }
387        }
388
389        if self.find_next_match_in_doc() {
390            self.match_start
391        } else {
392            NO_MORE_POSITIONS
393        }
394    }
395
396    fn start_position(&self) -> u32 {
397        self.match_start
398    }
399    fn end_position(&self) -> u32 {
400        self.match_end
401    }
402    fn width(&self) -> u32 {
403        self.match_width
404    }
405}
406
407// ---------------------------------------------------------------------------
408// NearSpansUnordered — unordered proximity matching via sliding window
409// ---------------------------------------------------------------------------
410
411/// Finds documents where all sub-spans appear within `slop` total gap
412/// positions of each other, in any order. Uses a sliding window approach
413/// over position arrays.
414struct NearSpansUnordered<'a> {
415    sub_spans: Vec<TermSpans<'a>>,
416    slop: u32,
417    current_doc: DocId,
418    match_start: u32,
419    match_end: u32,
420    match_width: u32,
421    /// Per-term position indices for the sliding window.
422    indices: Vec<usize>,
423    first_in_doc: bool,
424}
425
426unsafe impl Send for NearSpansUnordered<'_> {}
427
428impl<'a> NearSpansUnordered<'a> {
429    fn new(sub_spans: Vec<TermSpans<'a>>, slop: u32) -> Self {
430        let n = sub_spans.len();
431        Self {
432            sub_spans,
433            slop,
434            current_doc: NO_MORE_DOCS,
435            match_start: NO_MORE_POSITIONS,
436            match_end: NO_MORE_POSITIONS,
437            match_width: 0,
438            indices: vec![0; n],
439            first_in_doc: false,
440        }
441    }
442
443    /// Reuse the same conjunction logic as ordered.
444    fn advance_to_common_doc(&mut self) -> DocId {
445        if self.sub_spans.is_empty() {
446            return NO_MORE_DOCS;
447        }
448        let mut target = self.sub_spans[0].doc_id();
449        if target == NO_MORE_DOCS {
450            return NO_MORE_DOCS;
451        }
452        let mut i = 1;
453        while i < self.sub_spans.len() {
454            let doc = self.sub_spans[i].doc_id();
455            if doc == target {
456                i += 1;
457                continue;
458            }
459            if doc == NO_MORE_DOCS {
460                return NO_MORE_DOCS;
461            }
462            if doc < target {
463                let new_doc = self.sub_spans[i].advance_doc(target);
464                if new_doc == NO_MORE_DOCS {
465                    return NO_MORE_DOCS;
466                }
467                if new_doc > target {
468                    target = new_doc;
469                    let d0 = self.sub_spans[0].advance_doc(target);
470                    if d0 == NO_MORE_DOCS {
471                        return NO_MORE_DOCS;
472                    }
473                    target = d0;
474                    i = 1;
475                    continue;
476                }
477                i += 1;
478            } else {
479                target = doc;
480                let d0 = self.sub_spans[0].advance_doc(target);
481                if d0 == NO_MORE_DOCS {
482                    return NO_MORE_DOCS;
483                }
484                target = d0;
485                i = 1;
486            }
487        }
488        target
489    }
490
491    /// Get positions for sub-span i in the current doc.
492    fn get_positions(&self, i: usize) -> Vec<u32> {
493        let s = &self.sub_spans[i];
494        if s.current_tf == 1 {
495            vec![s.reader.first_position()]
496        } else {
497            s.reader.positions().to_vec()
498        }
499    }
500
501    /// Sliding window: find a combination of positions (one per term) where
502    /// the total gap fits within slop. Uses the min-advance approach.
503    fn find_match_unordered(&mut self) -> bool {
504        let n = self.sub_spans.len();
505        let all_positions: Vec<Vec<u32>> = (0..n).map(|i| self.get_positions(i)).collect();
506
507        // Check all position lists are non-empty
508        for positions in &all_positions {
509            if positions.is_empty() {
510                return false;
511            }
512        }
513
514        // Initialize indices to 0
515        for idx in &mut self.indices {
516            *idx = 0;
517        }
518
519        let max_span = self.slop + n as u32 - 1;
520
521        loop {
522            let mut min_pos = u32::MAX;
523            let mut max_pos = 0u32;
524            let mut min_idx = 0;
525
526            for (i, &idx) in self.indices.iter().enumerate() {
527                if idx >= all_positions[i].len() {
528                    return false;
529                }
530                let pos = all_positions[i][idx];
531                if pos < min_pos {
532                    min_pos = pos;
533                    min_idx = i;
534                }
535                if pos > max_pos {
536                    max_pos = pos;
537                }
538            }
539
540            let window = max_pos - min_pos;
541            if window <= max_span {
542                self.match_start = min_pos;
543                self.match_end = max_pos + 1;
544                self.match_width = window - (n as u32 - 1); // total gap
545                return true;
546            }
547
548            // Advance the minimum
549            self.indices[min_idx] += 1;
550            if self.indices[min_idx] >= all_positions[min_idx].len() {
551                return false;
552            }
553        }
554    }
555}
556
557impl Spans for NearSpansUnordered<'_> {
558    fn doc_id(&self) -> DocId {
559        self.current_doc
560    }
561
562    fn next_doc(&mut self) -> DocId {
563        let next = self.sub_spans[0].next_doc();
564        if next == NO_MORE_DOCS {
565            self.current_doc = NO_MORE_DOCS;
566            return NO_MORE_DOCS;
567        }
568        for i in 1..self.sub_spans.len() {
569            self.sub_spans[i].next_doc();
570        }
571        self.current_doc = self.advance_to_common_doc();
572        self.first_in_doc = true;
573        self.current_doc
574    }
575
576    fn advance_doc(&mut self, target: DocId) -> DocId {
577        for s in &mut self.sub_spans {
578            s.advance_doc(target);
579        }
580        self.current_doc = self.advance_to_common_doc();
581        self.first_in_doc = true;
582        self.current_doc
583    }
584
585    fn next_start_position(&mut self) -> u32 {
586        if self.current_doc == NO_MORE_DOCS {
587            return NO_MORE_POSITIONS;
588        }
589        if self.first_in_doc {
590            self.first_in_doc = false;
591            if self.find_match_unordered() {
592                return self.match_start;
593            }
594            return NO_MORE_POSITIONS;
595        }
596        // For subsequent matches in same doc, advance past current match
597        // This is simplified — we only return the first match per doc.
598        NO_MORE_POSITIONS
599    }
600
601    fn start_position(&self) -> u32 {
602        self.match_start
603    }
604    fn end_position(&self) -> u32 {
605        self.match_end
606    }
607    fn width(&self) -> u32 {
608        self.match_width
609    }
610}
611
612// ---------------------------------------------------------------------------
613// SpanNotQuery — drop docs that also match the exclude query (doc-level filter)
614// ---------------------------------------------------------------------------
615
616pub struct SpanNotQuery {
617    pub(crate) include: Box<dyn SpanQuery>,
618    pub(crate) exclude: Box<dyn SpanQuery>,
619}
620
621impl Query for SpanNotQuery {
622    fn bind(&self, searcher: &Searcher, score_mode: ScoreMode) -> Result<Box<dyn BoundQuery>> {
623        // Trait upcast Box<dyn BoundSpanQuery> → Box<dyn BoundQuery>.
624        Ok(<Self as SpanQuery>::bind_span(self, searcher, score_mode)?)
625    }
626}
627
628impl SpanQuery for SpanNotQuery {
629    fn bind_span(
630        &self,
631        searcher: &Searcher,
632        score_mode: ScoreMode,
633    ) -> Result<Box<dyn BoundSpanQuery>> {
634        let include_weight = self.include.bind_span(searcher, score_mode)?;
635        let exclude_weight = self.exclude.bind_span(searcher, score_mode)?;
636        Ok(Box::new(BoundSpanNotQuery {
637            include_weight,
638            exclude_weight,
639        }))
640    }
641}
642
643struct BoundSpanNotQuery {
644    include_weight: Box<dyn BoundSpanQuery>,
645    exclude_weight: Box<dyn BoundSpanQuery>,
646}
647
648impl BoundQuery for BoundSpanNotQuery {
649    fn scorer_supplier(&self, reader: &SegmentReader) -> Result<Option<Box<dyn ScorerSupplier>>> {
650        let include = match self.include_weight.scorer_supplier(reader)? {
651            Some(s) => s,
652            None => return Ok(None),
653        };
654        let exclude = self.exclude_weight.scorer_supplier(reader)?;
655        Ok(Some(Box::new(SpanNotScorerSupplier { include, exclude })))
656    }
657}
658
659impl BoundSpanQuery for BoundSpanNotQuery {
660    fn span_scorer_supplier(
661        &self,
662        reader: &SegmentReader,
663        max_end: u32,
664    ) -> Result<Option<Box<dyn ScorerSupplier>>> {
665        // SpanNot under SpanFirst: propagate end to include; exclude
666        // is a doc-level filter and doesn't need the constraint.
667        let include = match self.include_weight.span_scorer_supplier(reader, max_end)? {
668            Some(s) => s,
669            None => return Ok(None),
670        };
671        let exclude = self.exclude_weight.scorer_supplier(reader)?;
672        Ok(Some(Box::new(SpanNotScorerSupplier { include, exclude })))
673    }
674}
675
676struct SpanNotScorerSupplier {
677    include: Box<dyn ScorerSupplier>,
678    exclude: Option<Box<dyn ScorerSupplier>>,
679}
680
681impl ScorerSupplier for SpanNotScorerSupplier {
682    fn cost(&self) -> u64 {
683        self.include.cost()
684    }
685    fn scorer(self: Box<Self>) -> Result<Box<dyn Scorer>> {
686        let include = self.include.scorer()?;
687        let exclude = match self.exclude {
688            Some(e) => Some(e.scorer()?),
689            None => None,
690        };
691        let mut scorer = SpanNotScorer { include, exclude };
692        scorer.find_next_non_excluded();
693        Ok(Box::new(scorer))
694    }
695}
696
697/// Wraps an include scorer, filtering out docs that also match the exclude.
698struct SpanNotScorer {
699    include: Box<dyn Scorer>,
700    exclude: Option<Box<dyn Scorer>>,
701}
702
703impl SpanNotScorer {
704    fn is_excluded(&mut self) -> bool {
705        let Some(ref mut exc) = self.exclude else {
706            return false;
707        };
708        let doc = self.include.doc_id();
709        if exc.doc_id() < doc {
710            exc.advance(doc);
711        }
712        exc.doc_id() == doc
713    }
714
715    fn find_next_non_excluded(&mut self) -> DocId {
716        loop {
717            let doc = self.include.doc_id();
718            if doc == NO_MORE_DOCS {
719                return NO_MORE_DOCS;
720            }
721            if !self.is_excluded() {
722                return doc;
723            }
724            self.include.next();
725        }
726    }
727}
728
729impl Scorer for SpanNotScorer {
730    fn doc_id(&self) -> DocId {
731        self.include.doc_id()
732    }
733    fn next(&mut self) -> DocId {
734        self.include.next();
735        self.find_next_non_excluded()
736    }
737    fn advance(&mut self, target: DocId) -> DocId {
738        self.include.advance(target);
739        self.find_next_non_excluded()
740    }
741    fn score(&mut self) -> f32 {
742        // exclude only filters matching; scoring comes from the include scorer.
743        self.include.score()
744    }
745    fn two_phase(&mut self) -> Option<&mut dyn TwoPhaseIterator> {
746        None
747    }
748}
749
750// ---------------------------------------------------------------------------
751// SpanFirstQuery — match spans ending within the first N positions
752// ---------------------------------------------------------------------------
753
754pub struct SpanFirstQuery {
755    pub(crate) inner: Box<dyn SpanQuery>,
756    pub end: u32,
757}
758
759impl Query for SpanFirstQuery {
760    fn bind(&self, searcher: &Searcher, score_mode: ScoreMode) -> Result<Box<dyn BoundQuery>> {
761        Ok(<Self as SpanQuery>::bind_span(self, searcher, score_mode)?)
762    }
763}
764
765impl SpanQuery for SpanFirstQuery {
766    fn bind_span(
767        &self,
768        searcher: &Searcher,
769        score_mode: ScoreMode,
770    ) -> Result<Box<dyn BoundSpanQuery>> {
771        let inner_weight = self.inner.bind_span(searcher, score_mode)?;
772        Ok(Box::new(BoundSpanFirstQuery {
773            inner_weight,
774            end: self.end,
775        }))
776    }
777}
778
779struct BoundSpanFirstQuery {
780    inner_weight: Box<dyn BoundSpanQuery>,
781    end: u32,
782}
783
784impl BoundQuery for BoundSpanFirstQuery {
785    fn scorer_supplier(&self, reader: &SegmentReader) -> Result<Option<Box<dyn ScorerSupplier>>> {
786        // Route through the inner's span_scorer_supplier so the end
787        // constraint reaches the underlying TermSpans / NearSpans
788        // iterator via a FilterSpans wrapper. The inner is statically
789        // typed Box<dyn BoundSpanQuery>, so only span types are
790        // reachable here.
791        self.inner_weight.span_scorer_supplier(reader, self.end)
792    }
793}
794
795impl BoundSpanQuery for BoundSpanFirstQuery {
796    fn span_scorer_supplier(
797        &self,
798        reader: &SegmentReader,
799        max_end: u32,
800    ) -> Result<Option<Box<dyn ScorerSupplier>>> {
801        // Nested SpanFirst: take the tighter of the two end constraints.
802        self.inner_weight
803            .span_scorer_supplier(reader, max_end.min(self.end))
804    }
805}
806
807// ---------------------------------------------------------------------------
808// SpanTermQuery
809
810// ---------------------------------------------------------------------------
811
812pub struct SpanTermQuery {
813    pub field: String,
814    pub value: String,
815}
816
817impl Query for SpanTermQuery {
818    fn bind(&self, searcher: &Searcher, score_mode: ScoreMode) -> Result<Box<dyn BoundQuery>> {
819        Ok(<Self as SpanQuery>::bind_span(self, searcher, score_mode)?)
820    }
821}
822
823impl SpanQuery for SpanTermQuery {
824    fn bind_span(&self, searcher: &Searcher, _: ScoreMode) -> Result<Box<dyn BoundSpanQuery>> {
825        let total_docs = searcher.total_docs();
826        let doc_freq = searcher.doc_freq(&self.field, &self.value);
827        let idf = bm25_idf(total_docs, doc_freq);
828        let avg_field_length = searcher.avg_field_length(&self.field);
829        Ok(Box::new(BoundSpanTermQuery {
830            field: self.field.clone(),
831            value: self.value.clone(),
832            idf,
833            avg_field_length,
834        }))
835    }
836}
837
838struct BoundSpanTermQuery {
839    field: String,
840    value: String,
841    idf: f32,
842    avg_field_length: f32,
843}
844
845impl BoundQuery for BoundSpanTermQuery {
846    fn scorer_supplier(&self, reader: &SegmentReader) -> Result<Option<Box<dyn ScorerSupplier>>> {
847        let field_id = match reader
848            .header()
849            .fields
850            .iter()
851            .find(|f| f.field_name == self.field)
852            .map(|f| f.field_id)
853        {
854            Some(id) => id,
855            None => return Ok(None),
856        };
857        if reader
858            .postings_with_positions(field_id, &self.value)
859            .is_none()
860        {
861            return Ok(None);
862        }
863        Ok(Some(Box::new(SpanTermScorerSupplier {
864            segment: reader as *const SegmentReader,
865            field_id,
866            value: self.value.clone(),
867            idf: self.idf,
868            avg_field_length: self.avg_field_length,
869        })))
870    }
871}
872
873impl BoundSpanQuery for BoundSpanTermQuery {
874    fn span_scorer_supplier(
875        &self,
876        reader: &SegmentReader,
877        max_end: u32,
878    ) -> Result<Option<Box<dyn ScorerSupplier>>> {
879        let field_id = match reader
880            .header()
881            .fields
882            .iter()
883            .find(|f| f.field_name == self.field)
884            .map(|f| f.field_id)
885        {
886            Some(id) => id,
887            None => return Ok(None),
888        };
889        if reader
890            .postings_with_positions(field_id, &self.value)
891            .is_none()
892        {
893            return Ok(None);
894        }
895        Ok(Some(Box::new(FilteredSpanTermScorerSupplier {
896            segment: reader as *const SegmentReader,
897            field_id,
898            value: self.value.clone(),
899            idf: self.idf,
900            avg_field_length: self.avg_field_length,
901            max_end,
902        })))
903    }
904}
905
906struct SpanTermScorerSupplier {
907    segment: *const SegmentReader,
908    field_id: FieldId,
909    value: String,
910    idf: f32,
911    avg_field_length: f32,
912}
913unsafe impl Send for SpanTermScorerSupplier {}
914
915impl ScorerSupplier for SpanTermScorerSupplier {
916    fn cost(&self) -> u64 {
917        1000
918    }
919    fn scorer(self: Box<Self>) -> Result<Box<dyn Scorer>> {
920        let reader = unsafe { &*self.segment };
921        let pos_reader = reader
922            .postings_with_positions(self.field_id, &self.value)
923            .unwrap();
924        let norms = reader.norms(self.field_id);
925        let mut spans = TermSpans::new(pos_reader);
926        spans.next_doc(); // position on first doc
927        Ok(Box::new(SimpleSpanScorer {
928            spans,
929            idf: self.idf,
930            avg_field_length: self.avg_field_length,
931            norms,
932        }))
933    }
934}
935
936/// Supplier for a SpanTerm wrapped in SpanFirst: produces a scorer that
937/// emits only docs where at least one span has end_position <= max_end,
938/// and scores by BM25 with tf = count of matching spans (matching
939/// Lucene's SpanScorer semantics).
940struct FilteredSpanTermScorerSupplier {
941    segment: *const SegmentReader,
942    field_id: FieldId,
943    value: String,
944    idf: f32,
945    avg_field_length: f32,
946    max_end: u32,
947}
948unsafe impl Send for FilteredSpanTermScorerSupplier {}
949
950impl ScorerSupplier for FilteredSpanTermScorerSupplier {
951    fn cost(&self) -> u64 {
952        1000
953    }
954    fn scorer(self: Box<Self>) -> Result<Box<dyn Scorer>> {
955        let reader = unsafe { &*self.segment };
956        let pos_reader = reader
957            .postings_with_positions(self.field_id, &self.value)
958            .unwrap();
959        let norms = reader.norms(self.field_id);
960        let mut inner = TermSpans::new(pos_reader);
961        inner.next_doc();
962        let spans = FilterSpans {
963            inner,
964            max_end: self.max_end,
965        };
966        let mut scorer = FilteredSpanTermScorer {
967            spans,
968            idf: self.idf,
969            avg_field_length: self.avg_field_length,
970            norms,
971            freq: 0.0,
972        };
973        scorer.find_next_matching_doc();
974        Ok(Box::new(scorer))
975    }
976}
977
978/// Scorer for SpanFirst(SpanTerm): iterates positions per doc through
979/// FilterSpans, emits only docs where at least one span survived the
980/// filter, scores by BM25 with tf = surviving-span count.
981struct FilteredSpanTermScorer<'a> {
982    spans: FilterSpans<TermSpans<'a>>,
983    idf: f32,
984    avg_field_length: f32,
985    norms: Option<FieldNormsReader<'a>>,
986    freq: f32,
987}
988
989unsafe impl Send for FilteredSpanTermScorer<'_> {}
990
991impl FilteredSpanTermScorer<'_> {
992    fn find_next_matching_doc(&mut self) -> DocId {
993        loop {
994            if self.spans.doc_id() == NO_MORE_DOCS {
995                self.freq = 0.0;
996                return NO_MORE_DOCS;
997            }
998            let mut freq = 0.0f32;
999            while self.spans.next_start_position() != NO_MORE_POSITIONS {
1000                freq += 1.0;
1001            }
1002            if freq > 0.0 {
1003                self.freq = freq;
1004                return self.spans.doc_id();
1005            }
1006            self.spans.next_doc();
1007        }
1008    }
1009}
1010
1011impl Scorer for FilteredSpanTermScorer<'_> {
1012    fn doc_id(&self) -> DocId {
1013        self.spans.doc_id()
1014    }
1015    fn next(&mut self) -> DocId {
1016        self.spans.next_doc();
1017        self.find_next_matching_doc()
1018    }
1019    fn advance(&mut self, target: DocId) -> DocId {
1020        self.spans.advance_doc(target);
1021        self.find_next_matching_doc()
1022    }
1023    fn score(&mut self) -> f32 {
1024        let dl = self
1025            .norms
1026            .as_ref()
1027            .map(|n| n.norm(self.doc_id()))
1028            .unwrap_or(1.0);
1029        bm25_score(self.idf, self.freq, dl, self.avg_field_length)
1030    }
1031    fn two_phase(&mut self) -> Option<&mut dyn TwoPhaseIterator> {
1032        None
1033    }
1034}
1035
1036// ---------------------------------------------------------------------------
// SpanNearQuery (ordered and unordered)
1038// ---------------------------------------------------------------------------
1039
1040pub struct SpanNearQuery {
1041    pub field: String,
1042    pub terms: Vec<String>,
1043    pub slop: u32,
1044    pub in_order: bool,
1045}
1046
1047impl Query for SpanNearQuery {
1048    fn bind(&self, searcher: &Searcher, score_mode: ScoreMode) -> Result<Box<dyn BoundQuery>> {
1049        Ok(<Self as SpanQuery>::bind_span(self, searcher, score_mode)?)
1050    }
1051}
1052
1053impl SpanQuery for SpanNearQuery {
1054    fn bind_span(&self, searcher: &Searcher, _: ScoreMode) -> Result<Box<dyn BoundSpanQuery>> {
1055        // Phrase IDF: sum of individual term IDFs (matches Lucene's
1056        // PhraseQuery convention).
1057        let total_docs = searcher.total_docs();
1058        let idf: f32 = self
1059            .terms
1060            .iter()
1061            .map(|t| bm25_idf(total_docs, searcher.doc_freq(&self.field, t)))
1062            .sum();
1063        let avg_field_length = searcher.avg_field_length(&self.field);
1064        Ok(Box::new(BoundSpanNearQuery {
1065            field: self.field.clone(),
1066            terms: self.terms.clone(),
1067            slop: self.slop,
1068            in_order: self.in_order,
1069            idf,
1070            avg_field_length,
1071        }))
1072    }
1073}
1074
1075struct BoundSpanNearQuery {
1076    field: String,
1077    terms: Vec<String>,
1078    slop: u32,
1079    in_order: bool,
1080    idf: f32,
1081    avg_field_length: f32,
1082}
1083
1084impl BoundQuery for BoundSpanNearQuery {
1085    fn scorer_supplier(&self, reader: &SegmentReader) -> Result<Option<Box<dyn ScorerSupplier>>> {
1086        let field_id = match reader
1087            .header()
1088            .fields
1089            .iter()
1090            .find(|f| f.field_name == self.field)
1091            .map(|f| f.field_id)
1092        {
1093            Some(id) => id,
1094            None => return Ok(None),
1095        };
1096        // All terms must exist
1097        for term in &self.terms {
1098            if reader.postings_with_positions(field_id, term).is_none() {
1099                return Ok(None);
1100            }
1101        }
1102        Ok(Some(Box::new(SpanNearScorerSupplier {
1103            segment: reader as *const SegmentReader,
1104            field_id,
1105            terms: self.terms.clone(),
1106            slop: self.slop,
1107            in_order: self.in_order,
1108            idf: self.idf,
1109            avg_field_length: self.avg_field_length,
1110            max_end: None,
1111        })))
1112    }
1113}
1114
1115impl BoundSpanQuery for BoundSpanNearQuery {
1116    fn span_scorer_supplier(
1117        &self,
1118        reader: &SegmentReader,
1119        max_end: u32,
1120    ) -> Result<Option<Box<dyn ScorerSupplier>>> {
1121        let field_id = match reader
1122            .header()
1123            .fields
1124            .iter()
1125            .find(|f| f.field_name == self.field)
1126            .map(|f| f.field_id)
1127        {
1128            Some(id) => id,
1129            None => return Ok(None),
1130        };
1131        for term in &self.terms {
1132            if reader.postings_with_positions(field_id, term).is_none() {
1133                return Ok(None);
1134            }
1135        }
1136        Ok(Some(Box::new(SpanNearScorerSupplier {
1137            segment: reader as *const SegmentReader,
1138            field_id,
1139            terms: self.terms.clone(),
1140            slop: self.slop,
1141            in_order: self.in_order,
1142            idf: self.idf,
1143            avg_field_length: self.avg_field_length,
1144            max_end: Some(max_end),
1145        })))
1146    }
1147}
1148
1149struct SpanNearScorerSupplier {
1150    segment: *const SegmentReader,
1151    field_id: FieldId,
1152    terms: Vec<String>,
1153    slop: u32,
1154    in_order: bool,
1155    idf: f32,
1156    avg_field_length: f32,
1157    /// When `Some`, the scorer wraps the NearSpans iterator in
1158    /// `FilterSpans` and emits only docs where at least one span has
1159    /// `end_position() <= max_end`. Used for `SpanFirst(SpanNear, end)`.
1160    max_end: Option<u32>,
1161}
1162unsafe impl Send for SpanNearScorerSupplier {}
1163
1164impl ScorerSupplier for SpanNearScorerSupplier {
1165    fn cost(&self) -> u64 {
1166        1000
1167    }
1168    fn scorer(self: Box<Self>) -> Result<Box<dyn Scorer>> {
1169        let reader = unsafe { &*self.segment };
1170        let sub_spans: Vec<TermSpans> = self
1171            .terms
1172            .iter()
1173            .map(|t| TermSpans::new(reader.postings_with_positions(self.field_id, t).unwrap()))
1174            .collect();
1175        let norms = reader.norms(self.field_id);
1176
1177        match (self.in_order, self.max_end) {
1178            (true, None) => {
1179                let mut spans = NearSpansOrdered::new(sub_spans, self.slop);
1180                spans.next_doc();
1181                let mut scorer = TwoPhaseSpanScorer {
1182                    spans,
1183                    idf: self.idf,
1184                    avg_field_length: self.avg_field_length,
1185                    norms,
1186                    sloppy_freq: 0.0,
1187                };
1188                scorer.find_next_matching_doc();
1189                Ok(Box::new(scorer))
1190            }
1191            (false, None) => {
1192                let mut spans = NearSpansUnordered::new(sub_spans, self.slop);
1193                spans.next_doc();
1194                let mut scorer = TwoPhaseSpanScorerUnordered {
1195                    spans,
1196                    idf: self.idf,
1197                    avg_field_length: self.avg_field_length,
1198                    norms,
1199                    sloppy_freq: 0.0,
1200                };
1201                scorer.find_next_matching_doc();
1202                Ok(Box::new(scorer))
1203            }
1204            (true, Some(max_end)) => {
1205                let mut inner = NearSpansOrdered::new(sub_spans, self.slop);
1206                inner.next_doc();
1207                let spans = FilterSpans { inner, max_end };
1208                let mut scorer = FilteredNearSpanScorer {
1209                    spans,
1210                    idf: self.idf,
1211                    avg_field_length: self.avg_field_length,
1212                    norms,
1213                    sloppy_freq: 0.0,
1214                };
1215                scorer.find_next_matching_doc();
1216                Ok(Box::new(scorer))
1217            }
1218            (false, Some(max_end)) => {
1219                let mut inner = NearSpansUnordered::new(sub_spans, self.slop);
1220                inner.next_doc();
1221                let spans = FilterSpans { inner, max_end };
1222                let mut scorer = FilteredNearSpanScorer {
1223                    spans,
1224                    idf: self.idf,
1225                    avg_field_length: self.avg_field_length,
1226                    norms,
1227                    sloppy_freq: 0.0,
1228                };
1229                scorer.find_next_matching_doc();
1230                Ok(Box::new(scorer))
1231            }
1232        }
1233    }
1234}
1235
1236/// Scorer for `SpanFirst(SpanNear)`: iterates positions through
1237/// `FilterSpans<NearSpans*>`, accumulating sloppy frequency over
1238/// surviving spans. Emits only docs where at least one span had
1239/// `end_position() <= max_end`.
1240///
1241/// Generic over the near-spans variant (ordered or unordered) so
1242/// one scorer type serves both.
1243struct FilteredNearSpanScorer<'a, S: Spans> {
1244    spans: FilterSpans<S>,
1245    idf: f32,
1246    avg_field_length: f32,
1247    norms: Option<FieldNormsReader<'a>>,
1248    sloppy_freq: f32,
1249}
1250
1251unsafe impl<S: Spans> Send for FilteredNearSpanScorer<'_, S> {}
1252
1253impl<S: Spans> FilteredNearSpanScorer<'_, S> {
1254    fn find_next_matching_doc(&mut self) -> DocId {
1255        loop {
1256            if self.spans.doc_id() == NO_MORE_DOCS {
1257                self.sloppy_freq = 0.0;
1258                return NO_MORE_DOCS;
1259            }
1260            let mut freq = 0.0f32;
1261            while self.spans.next_start_position() != NO_MORE_POSITIONS {
1262                freq += 1.0 / (1.0 + self.spans.width() as f32);
1263            }
1264            if freq > 0.0 {
1265                self.sloppy_freq = freq;
1266                return self.spans.doc_id();
1267            }
1268            self.spans.next_doc();
1269        }
1270    }
1271}
1272
1273impl<S: Spans> Scorer for FilteredNearSpanScorer<'_, S> {
1274    fn doc_id(&self) -> DocId {
1275        self.spans.doc_id()
1276    }
1277    fn next(&mut self) -> DocId {
1278        self.spans.next_doc();
1279        self.find_next_matching_doc()
1280    }
1281    fn advance(&mut self, target: DocId) -> DocId {
1282        self.spans.advance_doc(target);
1283        self.find_next_matching_doc()
1284    }
1285    fn score(&mut self) -> f32 {
1286        let dl = self
1287            .norms
1288            .as_ref()
1289            .map(|n| n.norm(self.doc_id()))
1290            .unwrap_or(1.0);
1291        bm25_score(self.idf, self.sloppy_freq, dl, self.avg_field_length)
1292    }
1293    fn two_phase(&mut self) -> Option<&mut dyn TwoPhaseIterator> {
1294        None
1295    }
1296}
1297
1298// ---------------------------------------------------------------------------
1299// Scorer implementations that wrap Spans
1300// ---------------------------------------------------------------------------
1301
1302/// Simple span scorer for span_term: every doc with the term matches.
1303/// Computes BM25 score using term frequency from the underlying reader.
1304struct SimpleSpanScorer<'a> {
1305    spans: TermSpans<'a>,
1306    idf: f32,
1307    avg_field_length: f32,
1308    norms: Option<FieldNormsReader<'a>>,
1309}
1310
1311unsafe impl Send for SimpleSpanScorer<'_> {}
1312
1313impl Scorer for SimpleSpanScorer<'_> {
1314    fn doc_id(&self) -> DocId {
1315        self.spans.doc_id()
1316    }
1317    fn next(&mut self) -> DocId {
1318        self.spans.next_doc()
1319    }
1320    fn advance(&mut self, target: DocId) -> DocId {
1321        self.spans.advance_doc(target)
1322    }
1323    fn score(&mut self) -> f32 {
1324        let tf = self.spans.current_tf as f32;
1325        let dl = self
1326            .norms
1327            .as_ref()
1328            .map(|n| n.norm(self.doc_id()))
1329            .unwrap_or(1.0);
1330        bm25_score(self.idf, tf, dl, self.avg_field_length)
1331    }
1332    fn two_phase(&mut self) -> Option<&mut dyn TwoPhaseIterator> {
1333        None
1334    }
1335}
1336
1337/// Two-phase span scorer for span_near: doc conjunction + position check.
1338/// Iterates through documents where all terms co-occur, then checks positions.
1339///
1340/// Uses Lucene's sloppy frequency: each span match contributes
1341/// `1.0 / (1.0 + width)` to the BM25 TF, penalizing wider (sloppier)
1342/// matches. See [[investigation-20260405-05-span-not-constant-score]].
1343struct TwoPhaseSpanScorer<'a> {
1344    spans: NearSpansOrdered<'a>,
1345    idf: f32,
1346    avg_field_length: f32,
1347    norms: Option<FieldNormsReader<'a>>,
1348    /// Sloppy frequency for BM25 (sum of 1/(1+width) over matches).
1349    sloppy_freq: f32,
1350}
1351
1352unsafe impl Send for TwoPhaseSpanScorer<'_> {}
1353
1354impl TwoPhaseSpanScorer<'_> {
1355    /// Find next doc where positions satisfy the near constraint, accumulating
1356    /// sloppy frequency over all span matches in that doc.
1357    fn find_next_matching_doc(&mut self) -> DocId {
1358        loop {
1359            if self.spans.current_doc == NO_MORE_DOCS {
1360                self.sloppy_freq = 0.0;
1361                return NO_MORE_DOCS;
1362            }
1363            // Accumulate sloppy freq over all matches in this doc.
1364            // Lucene SpanScorer.setFreqCurrentDoc().
1365            let mut freq: f32 = 0.0;
1366            while self.spans.next_start_position() != NO_MORE_POSITIONS {
1367                freq += 1.0 / (1.0 + self.spans.width() as f32);
1368            }
1369            if freq > 0.0 {
1370                self.sloppy_freq = freq;
1371                return self.spans.current_doc;
1372            }
1373            self.spans.next_doc();
1374        }
1375    }
1376}
1377
1378impl Scorer for TwoPhaseSpanScorer<'_> {
1379    fn doc_id(&self) -> DocId {
1380        self.spans.doc_id()
1381    }
1382    fn next(&mut self) -> DocId {
1383        self.spans.next_doc();
1384        self.find_next_matching_doc()
1385    }
1386    fn advance(&mut self, target: DocId) -> DocId {
1387        self.spans.advance_doc(target);
1388        self.find_next_matching_doc()
1389    }
1390    fn score(&mut self) -> f32 {
1391        let dl = self
1392            .norms
1393            .as_ref()
1394            .map(|n| n.norm(self.doc_id()))
1395            .unwrap_or(1.0);
1396        bm25_score(self.idf, self.sloppy_freq, dl, self.avg_field_length)
1397    }
1398    fn two_phase(&mut self) -> Option<&mut dyn TwoPhaseIterator> {
1399        None
1400    }
1401}
1402
1403/// Two-phase span scorer for unordered span_near.
1404///
1405/// NOTE: NearSpansUnordered currently returns at most one match per doc
1406/// (see comment in find_match_unordered). Sloppy freq for unordered
1407/// will be at most one contribution per doc until full counting lands.
1408struct TwoPhaseSpanScorerUnordered<'a> {
1409    spans: NearSpansUnordered<'a>,
1410    idf: f32,
1411    avg_field_length: f32,
1412    norms: Option<FieldNormsReader<'a>>,
1413    sloppy_freq: f32,
1414}
1415
1416unsafe impl Send for TwoPhaseSpanScorerUnordered<'_> {}
1417
1418impl TwoPhaseSpanScorerUnordered<'_> {
1419    fn find_next_matching_doc(&mut self) -> DocId {
1420        loop {
1421            if self.spans.current_doc == NO_MORE_DOCS {
1422                self.sloppy_freq = 0.0;
1423                return NO_MORE_DOCS;
1424            }
1425            let mut freq: f32 = 0.0;
1426            while self.spans.next_start_position() != NO_MORE_POSITIONS {
1427                freq += 1.0 / (1.0 + self.spans.width() as f32);
1428            }
1429            if freq > 0.0 {
1430                self.sloppy_freq = freq;
1431                return self.spans.current_doc;
1432            }
1433            self.spans.next_doc();
1434        }
1435    }
1436}
1437
1438impl Scorer for TwoPhaseSpanScorerUnordered<'_> {
1439    fn doc_id(&self) -> DocId {
1440        self.spans.doc_id()
1441    }
1442    fn next(&mut self) -> DocId {
1443        self.spans.next_doc();
1444        self.find_next_matching_doc()
1445    }
1446    fn advance(&mut self, target: DocId) -> DocId {
1447        self.spans.advance_doc(target);
1448        self.find_next_matching_doc()
1449    }
1450    fn score(&mut self) -> f32 {
1451        let dl = self
1452            .norms
1453            .as_ref()
1454            .map(|n| n.norm(self.doc_id()))
1455            .unwrap_or(1.0);
1456        bm25_score(self.idf, self.sloppy_freq, dl, self.avg_field_length)
1457    }
1458
1459    fn two_phase(&mut self) -> Option<&mut dyn TwoPhaseIterator> {
1460        None
1461    }
1462}
1463
1464// ---------------------------------------------------------------------------
1465// Tests
1466// ---------------------------------------------------------------------------
1467
1468#[cfg(test)]
1469mod tests {
1470    use super::*;
1471    use crate::analysis::Token;
1472    use crate::core::SegmentId;
1473    use crate::mapping::{FieldType, Mapping};
1474    use crate::segment::builder::SegmentBuilder;
1475
1476    fn make_tokens(terms: &[&str]) -> Vec<Token> {
1477        terms
1478            .iter()
1479            .enumerate()
1480            .map(|(i, t)| Token::new(*t, 0, t.len(), i as u32))
1481            .collect()
1482    }
1483
1484    fn build_store(docs: &[&[&str]]) -> crate::search::segment_store::SegmentStore {
1485        let schema = Mapping::builder().field("text", FieldType::Text).build();
1486        let mut builder = SegmentBuilder::new(SegmentId::new(1), &schema);
1487        for terms in docs {
1488            builder.add_document(&[(FieldId::new(0), make_tokens(terms))], b"{}");
1489        }
1490        let reader = SegmentReader::open(builder.build()).unwrap();
1491        crate::search::segment_store::SegmentStore::new(
1492            vec![reader],
1493            crate::analysis::AnalyzerRegistry::new(),
1494            None,
1495            None,
1496        )
1497    }
1498
1499    #[test]
1500    fn span_term_basic() {
1501        let store = build_store(&[
1502            &["the", "quick", "brown", "fox"],
1503            &["the", "lazy", "dog"],
1504            &["quick", "fox"],
1505        ]);
1506        let searcher = Searcher::new(&store);
1507        let results = searcher
1508            .search_query(
1509                &SpanTermQuery {
1510                    field: "text".into(),
1511                    value: "quick".into(),
1512                },
1513                10,
1514                0,
1515            )
1516            .unwrap();
1517        assert_eq!(results.total_hits.value, 2); // docs 0 and 2
1518    }
1519
1520    #[test]
1521    fn span_term_missing() {
1522        let store = build_store(&[&["the", "quick"]]);
1523        let searcher = Searcher::new(&store);
1524        let results = searcher
1525            .search_query(
1526                &SpanTermQuery {
1527                    field: "text".into(),
1528                    value: "nonexistent".into(),
1529                },
1530                10,
1531                0,
1532            )
1533            .unwrap();
1534        assert_eq!(results.total_hits.value, 0);
1535    }
1536
1537    #[test]
1538    fn span_near_exact_phrase() {
1539        // slop=0, in_order=true → exact phrase match
1540        let store = build_store(&[
1541            &["the", "quick", "brown", "fox"], // "quick brown" at [1,2]
1542            &["brown", "quick", "fox"],        // "quick" at [1], "brown" at [0] — wrong order
1543            &["quick", "brown"],               // exact match at [0,1]
1544        ]);
1545        let searcher = Searcher::new(&store);
1546        let results = searcher
1547            .search_query(
1548                &SpanNearQuery {
1549                    field: "text".into(),
1550                    terms: vec!["quick".into(), "brown".into()],
1551                    slop: 0,
1552                    in_order: true,
1553                },
1554                10,
1555                0,
1556            )
1557            .unwrap();
1558        assert_eq!(results.total_hits.value, 2); // docs 0 and 2
1559    }
1560
1561    #[test]
1562    fn span_near_with_slop() {
1563        let store = build_store(&[
1564            &["quick", "brown", "fox"],  // quick(0) fox(2): gap=1
1565            &["quick", "fox"],           // quick(0) fox(1): gap=0
1566            &["quick", "a", "b", "fox"], // quick(0) fox(3): gap=2
1567        ]);
1568        let searcher = Searcher::new(&store);
1569        let results = searcher
1570            .search_query(
1571                &SpanNearQuery {
1572                    field: "text".into(),
1573                    terms: vec!["quick".into(), "fox".into()],
1574                    slop: 1,
1575                    in_order: true,
1576                },
1577                10,
1578                0,
1579            )
1580            .unwrap();
1581        assert_eq!(results.total_hits.value, 2); // docs 0 (gap=1) and 1 (gap=0)
1582    }
1583
1584    #[test]
1585    fn span_near_no_match() {
1586        let store = build_store(&[
1587            &["quick", "a", "b", "c", "fox"], // gap=3, too far for slop=1
1588        ]);
1589        let searcher = Searcher::new(&store);
1590        let results = searcher
1591            .search_query(
1592                &SpanNearQuery {
1593                    field: "text".into(),
1594                    terms: vec!["quick".into(), "fox".into()],
1595                    slop: 1,
1596                    in_order: true,
1597                },
1598                10,
1599                0,
1600            )
1601            .unwrap();
1602        assert_eq!(results.total_hits.value, 0);
1603    }
1604
1605    #[test]
1606    fn span_near_three_terms() {
1607        let store = build_store(&[
1608            &["the", "quick", "brown", "fox"], // quick(1) brown(2) fox(3): consecutive
1609            &["quick", "fox", "brown"],        // wrong order for brown/fox
1610        ]);
1611        let searcher = Searcher::new(&store);
1612        let results = searcher
1613            .search_query(
1614                &SpanNearQuery {
1615                    field: "text".into(),
1616                    terms: vec!["quick".into(), "brown".into(), "fox".into()],
1617                    slop: 0,
1618                    in_order: true,
1619                },
1620                10,
1621                0,
1622            )
1623            .unwrap();
1624        assert_eq!(results.total_hits.value, 1); // only doc 0
1625    }
1626
1627    #[test]
1628    fn span_near_wrong_order() {
1629        let store = build_store(&[
1630            &["fox", "quick"], // fox before quick — doesn't match ordered
1631        ]);
1632        let searcher = Searcher::new(&store);
1633        let results = searcher
1634            .search_query(
1635                &SpanNearQuery {
1636                    field: "text".into(),
1637                    terms: vec!["quick".into(), "fox".into()],
1638                    slop: 5,
1639                    in_order: true,
1640                },
1641                10,
1642                0,
1643            )
1644            .unwrap();
1645        assert_eq!(results.total_hits.value, 0);
1646    }
1647
1648    #[test]
1649    fn span_near_one_term_missing() {
1650        let store = build_store(&[&["quick", "brown"]]);
1651        let searcher = Searcher::new(&store);
1652        let results = searcher
1653            .search_query(
1654                &SpanNearQuery {
1655                    field: "text".into(),
1656                    terms: vec!["quick".into(), "nonexistent".into()],
1657                    slop: 10,
1658                    in_order: true,
1659                },
1660                10,
1661                0,
1662            )
1663            .unwrap();
1664        assert_eq!(results.total_hits.value, 0);
1665    }
1666
1667    // --- Unordered tests ---
1668
1669    #[test]
1670    fn span_near_unordered_basic() {
1671        let store = build_store(&[
1672            &["the", "fox", "quick"],         // fox(1) quick(2): distance 1
1673            &["quick", "a", "b", "c", "fox"], // quick(0) fox(4): distance 3
1674        ]);
1675        let searcher = Searcher::new(&store);
1676        let results = searcher
1677            .search_query(
1678                &SpanNearQuery {
1679                    field: "text".into(),
1680                    terms: vec!["quick".into(), "fox".into()],
1681                    slop: 1,
1682                    in_order: false,
1683                },
1684                10,
1685                0,
1686            )
1687            .unwrap();
1688        assert_eq!(results.total_hits.value, 1); // only doc 0 (window=1)
1689    }
1690
1691    #[test]
1692    fn span_near_unordered_reversed() {
1693        // "fox" before "quick" — should match unordered but not ordered
1694        let store = build_store(&[&["fox", "quick"]]);
1695        let searcher = Searcher::new(&store);
1696        let ordered = searcher
1697            .search_query(
1698                &SpanNearQuery {
1699                    field: "text".into(),
1700                    terms: vec!["quick".into(), "fox".into()],
1701                    slop: 1,
1702                    in_order: true,
1703                },
1704                10,
1705                0,
1706            )
1707            .unwrap();
1708        assert_eq!(ordered.total_hits.value, 0); // ordered: doesn't match
1709
1710        let unordered = searcher
1711            .search_query(
1712                &SpanNearQuery {
1713                    field: "text".into(),
1714                    terms: vec!["quick".into(), "fox".into()],
1715                    slop: 0,
1716                    in_order: false,
1717                },
1718                10,
1719                0,
1720            )
1721            .unwrap();
1722        assert_eq!(unordered.total_hits.value, 1); // unordered: matches
1723    }
1724
1725    // --- SpanNot tests ---
1726
1727    #[test]
1728    fn span_not_basic() {
1729        let store = build_store(&[
1730            &["quick", "fox"],   // has "quick" and "fox"
1731            &["quick", "brown"], // has "quick" but not "fox"
1732            &["slow", "dog"],    // doesn't have "quick"
1733        ]);
1734        let searcher = Searcher::new(&store);
1735        let results = searcher
1736            .search_query(
1737                &SpanNotQuery {
1738                    include: Box::new(SpanTermQuery {
1739                        field: "text".into(),
1740                        value: "quick".into(),
1741                    }),
1742                    exclude: Box::new(SpanTermQuery {
1743                        field: "text".into(),
1744                        value: "fox".into(),
1745                    }),
1746                },
1747                10,
1748                0,
1749            )
1750            .unwrap();
1751        assert_eq!(results.total_hits.value, 1); // only doc 1 (has quick, no fox)
1752    }
1753
1754    #[test]
1755    fn span_not_no_exclusions() {
1756        let store = build_store(&[&["quick", "fox"], &["quick", "brown"]]);
1757        let searcher = Searcher::new(&store);
1758        let results = searcher
1759            .search_query(
1760                &SpanNotQuery {
1761                    include: Box::new(SpanTermQuery {
1762                        field: "text".into(),
1763                        value: "quick".into(),
1764                    }),
1765                    exclude: Box::new(SpanTermQuery {
1766                        field: "text".into(),
1767                        value: "nonexistent".into(),
1768                    }),
1769                },
1770                10,
1771                0,
1772            )
1773            .unwrap();
1774        assert_eq!(results.total_hits.value, 2); // nothing excluded
1775    }
1776
1777    /// Regression test for [[investigation-20260405-05-span-not-constant-score]].
1778    ///
1779    /// SpanTermScorer hardcoded score=1.0 instead of computing BM25.
1780    /// A doc with the term repeated multiple times should score higher
1781    /// than a doc with one occurrence (different TF).
1782    #[test]
1783    fn span_term_score_uses_bm25_tf() {
1784        let store = build_store(&[
1785            &["search", "engine", "search"], // tf=2 for "search"
1786            &["search", "tools"],            // tf=1 for "search"
1787        ]);
1788        let searcher = Searcher::new(&store);
1789        let query = SpanTermQuery {
1790            field: "text".into(),
1791            value: "search".into(),
1792        };
1793
1794        let weight = query.bind(&searcher, ScoreMode::Complete).unwrap();
1795        let supplier = weight
1796            .scorer_supplier(&searcher.segments()[0])
1797            .unwrap()
1798            .unwrap();
1799        let mut scorer = supplier.scorer().unwrap();
1800
1801        // Doc 0 has tf=2, Doc 1 has tf=1
1802        assert_eq!(scorer.doc_id(), DocId::new(0));
1803        let doc0_score = scorer.score();
1804        scorer.next();
1805        assert_eq!(scorer.doc_id(), DocId::new(1));
1806        let doc1_score = scorer.score();
1807
1808        assert!(
1809            doc0_score > doc1_score,
1810            "doc with tf=2 ({doc0_score}) must score higher than doc with tf=1 \
1811             ({doc1_score}) — span_term must use BM25 TF, not hardcoded 1.0"
1812        );
1813    }
1814
1815    /// Regression test: SpanTermScorer score must equal TermQuery score.
1816    #[test]
1817    fn span_term_score_matches_term_query() {
1818        let store = build_store(&[&["search", "engine", "search"], &["search", "tools"]]);
1819        let searcher = Searcher::new(&store);
1820
1821        let span_query = SpanTermQuery {
1822            field: "text".into(),
1823            value: "search".into(),
1824        };
1825        let term_query = crate::query::term::TermQuery {
1826            field: "text".into(),
1827            value: "search".into(),
1828        };
1829
1830        let span_weight = span_query.bind(&searcher, ScoreMode::Complete).unwrap();
1831        let span_supplier = span_weight
1832            .scorer_supplier(&searcher.segments()[0])
1833            .unwrap()
1834            .unwrap();
1835        let mut span_scorer = span_supplier.scorer().unwrap();
1836
1837        let term_weight = term_query.bind(&searcher, ScoreMode::Complete).unwrap();
1838        let term_supplier = term_weight
1839            .scorer_supplier(&searcher.segments()[0])
1840            .unwrap()
1841            .unwrap();
1842        let mut term_scorer = term_supplier.scorer().unwrap();
1843
1844        // Both should produce the same score for each doc
1845        for _ in 0..2 {
1846            assert_eq!(span_scorer.doc_id(), term_scorer.doc_id());
1847            let span_score = span_scorer.score();
1848            let term_score = term_scorer.score();
1849            assert!(
1850                (span_score - term_score).abs() < 1e-5,
1851                "span_term score ({span_score}) must equal term query score ({term_score}) \
1852                 for doc {:?}",
1853                span_scorer.doc_id()
1854            );
1855            span_scorer.next();
1856            term_scorer.next();
1857        }
1858    }
1859
1860    /// Regression test: SpanNear with sloppy frequency.
1861    ///
1862    /// Lucene's algorithm: each match contributes 1.0 / (1.0 + width).
1863    /// An exact match (width=0) contributes 1.0; a wide match contributes
1864    /// less. A doc with one exact match should score higher than a doc
1865    /// with one match using slop (same field length).
1866    #[test]
1867    fn span_near_sloppy_freq_penalizes_width() {
1868        // Both docs have the same length (5 tokens) and one match each,
1869        // but the match width differs.
1870        // Doc 0: "quick brown a b c"   — quick(0) brown(1), exact (width 0)
1871        // Doc 1: "quick a b brown c"   — quick(0) brown(3), gap=2  (width 2)
1872        let store = build_store(&[
1873            &["quick", "brown", "a", "b", "c"],
1874            &["quick", "a", "b", "brown", "c"],
1875        ]);
1876        let searcher = Searcher::new(&store);
1877        let query = SpanNearQuery {
1878            field: "text".into(),
1879            terms: vec!["quick".into(), "brown".into()],
1880            slop: 5,
1881            in_order: true,
1882        };
1883
1884        let weight = query.bind(&searcher, ScoreMode::Complete).unwrap();
1885        let supplier = weight
1886            .scorer_supplier(&searcher.segments()[0])
1887            .unwrap()
1888            .unwrap();
1889        let mut scorer = supplier.scorer().unwrap();
1890
1891        // Both should match
1892        assert_eq!(scorer.doc_id(), DocId::new(0));
1893        let exact_score = scorer.score();
1894        scorer.next();
1895        assert_eq!(scorer.doc_id(), DocId::new(1));
1896        let sloppy_score = scorer.score();
1897
1898        // Sloppy match (width 2 → freq 1/3) should score lower than
1899        // exact match (width 0 → freq 1.0). Both have same field length.
1900        assert!(
1901            exact_score > sloppy_score,
1902            "exact match ({exact_score}) must score higher than sloppy match ({sloppy_score}) — \
1903             sloppy frequency must penalize width"
1904        );
1905    }
1906
1907    /// Regression test: SpanNear must use BM25 with span frequency, not 1.0.
1908    #[test]
1909    fn span_near_score_uses_bm25() {
1910        // Doc 0: "quick brown" appears twice
1911        // Doc 1: "quick brown" appears once
1912        // Both docs have the same length to isolate TF effect.
1913        let store = build_store(&[
1914            &["quick", "brown", "and", "quick", "brown", "fox"],
1915            &["quick", "brown", "fox", "and", "lazy", "dog"],
1916        ]);
1917        let searcher = Searcher::new(&store);
1918        let query = SpanNearQuery {
1919            field: "text".into(),
1920            terms: vec!["quick".into(), "brown".into()],
1921            slop: 0,
1922            in_order: true,
1923        };
1924
1925        let weight = query.bind(&searcher, ScoreMode::Complete).unwrap();
1926        let supplier = weight
1927            .scorer_supplier(&searcher.segments()[0])
1928            .unwrap()
1929            .unwrap();
1930        let mut scorer = supplier.scorer().unwrap();
1931
1932        assert_eq!(scorer.doc_id(), DocId::new(0));
1933        let doc0_score = scorer.score();
1934        scorer.next();
1935        assert_eq!(scorer.doc_id(), DocId::new(1));
1936        let doc1_score = scorer.score();
1937
1938        assert_ne!(doc0_score, 1.0, "span_near score must not be hardcoded 1.0");
1939        assert!(
1940            doc0_score > doc1_score,
1941            "doc with 2 near matches ({doc0_score}) must score higher than \
1942             doc with 1 near match ({doc1_score})"
1943        );
1944    }
1945
1946    /// Regression test: SpanNot must delegate to include scorer's score,
1947    /// not return constant 1.0.
1948    #[test]
1949    fn span_not_delegates_score() {
1950        let store = build_store(&[
1951            &["search", "engine", "search"], // matches "search", not "lazy"
1952            &["search", "tools"],            // matches "search", not "lazy"
1953        ]);
1954        let searcher = Searcher::new(&store);
1955        let query = SpanNotQuery {
1956            include: Box::new(SpanTermQuery {
1957                field: "text".into(),
1958                value: "search".into(),
1959            }),
1960            exclude: Box::new(SpanTermQuery {
1961                field: "text".into(),
1962                value: "lazy".into(),
1963            }),
1964        };
1965
1966        let weight = query.bind(&searcher, ScoreMode::Complete).unwrap();
1967        let supplier = weight
1968            .scorer_supplier(&searcher.segments()[0])
1969            .unwrap()
1970            .unwrap();
1971        let mut scorer = supplier.scorer().unwrap();
1972
1973        // Doc 0 has tf=2, Doc 1 has tf=1 — different scores via BM25
1974        assert_eq!(scorer.doc_id(), DocId::new(0));
1975        let doc0_score = scorer.score();
1976        scorer.next();
1977        assert_eq!(scorer.doc_id(), DocId::new(1));
1978        let doc1_score = scorer.score();
1979
1980        assert_ne!(doc0_score, 1.0, "span_not score must not be hardcoded 1.0");
1981        assert!(
1982            doc0_score > doc1_score,
1983            "span_not must delegate to include score: doc0 ({doc0_score}) should > doc1 ({doc1_score})"
1984        );
1985    }
1986}