Skip to main content

hermes_core/query/
traits.rs

1//! Query and Scorer traits with async support
2//!
3//! Provides the core abstractions for search queries and document scoring.
4
5use std::future::Future;
6use std::pin::Pin;
7
8use crate::segment::SegmentReader;
9use crate::{DocId, Result, Score};
10
11/// BM25 parameters
12#[derive(Debug, Clone, Copy)]
13pub struct Bm25Params {
14    /// Term frequency saturation parameter (typically 1.2-2.0)
15    pub k1: f32,
16    /// Length normalization parameter (typically 0.75)
17    pub b: f32,
18}
19
20impl Default for Bm25Params {
21    fn default() -> Self {
22        Self { k1: 1.2, b: 0.75 }
23    }
24}
25
26/// Future type for scorer creation
27#[cfg(not(target_arch = "wasm32"))]
28pub type ScorerFuture<'a> = Pin<Box<dyn Future<Output = Result<Box<dyn Scorer + 'a>>> + Send + 'a>>;
29#[cfg(target_arch = "wasm32")]
30pub type ScorerFuture<'a> = Pin<Box<dyn Future<Output = Result<Box<dyn Scorer + 'a>>> + 'a>>;
31
32/// Future type for count estimation
33#[cfg(not(target_arch = "wasm32"))]
34pub type CountFuture<'a> = Pin<Box<dyn Future<Output = Result<u32>> + Send + 'a>>;
35#[cfg(target_arch = "wasm32")]
36pub type CountFuture<'a> = Pin<Box<dyn Future<Output = Result<u32>> + 'a>>;
37
38/// Per-document predicate closure type (platform-aware Send+Sync bounds)
39#[cfg(not(target_arch = "wasm32"))]
40pub type DocPredicate<'a> = Box<dyn Fn(DocId) -> bool + Send + Sync + 'a>;
41#[cfg(target_arch = "wasm32")]
42pub type DocPredicate<'a> = Box<dyn Fn(DocId) -> bool + 'a>;
43
44/// Compact bitset indexed by doc_id. O(1) lookup, ~2.25 MB for 18M docs.
45///
46/// Built from posting lists or predicate scans. Used by BMP filtered queries
47/// for fast per-slot predicate evaluation (~2ns per lookup vs ~30-40ns for
48/// a fast-field closure).
49pub struct DocBitset {
50    pub(crate) bits: Vec<u64>,
51}
52
53impl DocBitset {
54    /// Create an empty bitset for `num_docs` documents.
55    pub fn new(num_docs: u32) -> Self {
56        let num_words = (num_docs as usize).div_ceil(64);
57        Self {
58            bits: vec![0u64; num_words],
59        }
60    }
61
62    /// Set bit for `doc_id`.
63    #[inline]
64    pub fn set(&mut self, doc_id: u32) {
65        let word = doc_id as usize / 64;
66        let bit = doc_id as usize % 64;
67        if word < self.bits.len() {
68            self.bits[word] |= 1u64 << bit;
69        }
70    }
71
72    /// Test if `doc_id` is in the bitset.
73    #[inline(always)]
74    pub fn contains(&self, doc_id: u32) -> bool {
75        let word = doc_id as usize / 64;
76        let bit = doc_id as usize % 64;
77        word < self.bits.len() && self.bits[word] & (1u64 << bit) != 0
78    }
79
80    /// Number of set bits (matching docs).
81    pub fn count(&self) -> u32 {
82        self.bits.iter().map(|w| w.count_ones()).sum()
83    }
84
85    /// Build bitset from a predicate by scanning all docs. O(N).
86    pub fn from_predicate(num_docs: u32, pred: &dyn Fn(DocId) -> bool) -> Self {
87        let mut bs = Self::new(num_docs);
88        for doc_id in 0..num_docs {
89            if pred(doc_id) {
90                bs.set(doc_id);
91            }
92        }
93        bs
94    }
95
96    /// In-place OR (union): `self |= other`.
97    pub fn union_with(&mut self, other: &DocBitset) {
98        for (a, b) in self.bits.iter_mut().zip(other.bits.iter()) {
99            *a |= *b;
100        }
101    }
102
103    /// In-place AND (intersection): `self &= other`.
104    pub fn intersect_with(&mut self, other: &DocBitset) {
105        for (a, b) in self.bits.iter_mut().zip(other.bits.iter()) {
106            *a &= *b;
107        }
108        // Zero out any words beyond `other`'s length
109        for a in self.bits.iter_mut().skip(other.bits.len()) {
110            *a = 0;
111        }
112    }
113
114    /// In-place ANDNOT (subtract): `self &= !other`.
115    pub fn subtract(&mut self, other: &DocBitset) {
116        for (a, b) in self.bits.iter_mut().zip(other.bits.iter()) {
117            *a &= !*b;
118        }
119    }
120}
121
122/// Info for MaxScore-optimizable term queries
123#[derive(Debug, Clone)]
124pub struct TermQueryInfo {
125    /// Field being searched
126    pub field: crate::dsl::Field,
127    /// Term bytes (lowercase)
128    pub term: Vec<u8>,
129}
130
131/// Info for MaxScore-optimizable sparse term queries
132#[derive(Debug, Clone, Copy)]
133pub struct SparseTermQueryInfo {
134    /// Sparse vector field
135    pub field: crate::dsl::Field,
136    /// Dimension ID in the sparse vector
137    pub dim_id: u32,
138    /// Query weight for this dimension
139    pub weight: f32,
140    /// MaxScore heap factor (1.0 = exact, lower = approximate)
141    pub heap_factor: f32,
142    /// Multi-value combiner for ordinal deduplication
143    pub combiner: super::MultiValueCombiner,
144    /// Multiplier on executor limit to compensate for ordinal deduplication
145    /// (1.0 = exact, 2.0 = fetch 2x then combine down)
146    pub over_fetch_factor: f32,
147}
148
149/// Decomposition of a query for MaxScore optimization.
150///
151/// The planner inspects this to decide whether to use text MaxScore,
152/// sparse MaxScore, or standard BooleanScorer execution.
153#[derive(Debug, Clone)]
154pub enum QueryDecomposition {
155    /// Single text term — eligible for text MaxScore grouping
156    TextTerm(TermQueryInfo),
157    /// One or more sparse dimensions — eligible for sparse MaxScore
158    SparseTerms(Vec<SparseTermQueryInfo>),
159    /// Not decomposable — falls back to standard execution
160    Opaque,
161}
162
163/// Matched positions for a field (field_id, list of scored positions)
164/// Each position includes its individual score contribution
165pub type MatchedPositions = Vec<(u32, Vec<super::ScoredPosition>)>;
166
167macro_rules! define_query_traits {
168    ($($send_bounds:tt)*) => {
169        /// A search query (async)
170        ///
171        /// Note: `scorer` takes `&self` (not `&'a self`) so that scorers don't borrow the query.
172        /// This enables query composition - queries can create sub-queries locally and get their scorers.
173        /// Implementations must clone/capture any data they need during scorer creation.
174        pub trait Query: std::fmt::Display + $($send_bounds)* {
175            /// Create a scorer for this query against a single segment (async)
176            ///
177            /// The `limit` parameter specifies the maximum number of results to return.
178            /// This is passed from the top-level search limit.
179            ///
180            /// Note: The scorer borrows only the reader, not the query. Implementations
181            /// should capture any needed query data (field, terms, etc.) during creation.
182            fn scorer<'a>(
183                &self,
184                reader: &'a SegmentReader,
185                limit: usize,
186            ) -> ScorerFuture<'a>;
187
188            /// Estimated number of matching documents in a segment (async)
189            fn count_estimate<'a>(&self, reader: &'a SegmentReader) -> CountFuture<'a>;
190
191            /// Create a scorer synchronously (mmap/RAM only).
192            ///
193            /// Available when the `sync` feature is enabled.
194            /// Default implementation returns an error.
195            #[cfg(feature = "sync")]
196            fn scorer_sync<'a>(
197                &self,
198                reader: &'a SegmentReader,
199                limit: usize,
200            ) -> Result<Box<dyn Scorer + 'a>> {
201                let _ = (reader, limit);
202                Err(crate::error::Error::Query(
203                    "sync scorer not supported for this query type".into(),
204                ))
205            }
206
207            /// Decompose this query for MaxScore optimization.
208            ///
209            /// Returns `TextTerm` for simple term queries, `SparseTerms` for
210            /// sparse vector queries (single or multi-dim), or `Opaque` if
211            /// the query cannot be decomposed.
212            fn decompose(&self) -> QueryDecomposition {
213                QueryDecomposition::Opaque
214            }
215
216            /// True if this query is a pure filter (always scores 1.0, no positions).
217            /// Used by the planner to convert non-selective MUST filters into predicates.
218            fn is_filter(&self) -> bool {
219                false
220            }
221
222            /// For filter queries: return a cheap per-doc predicate against a segment.
223            /// The predicate does O(1) work per doc (e.g., fast-field lookup).
224            fn as_doc_predicate<'a>(
225                &self,
226                _reader: &'a SegmentReader,
227            ) -> Option<DocPredicate<'a>> {
228                None
229            }
230
231            /// Build a compact bitset of matching doc_ids for this query.
232            ///
233            /// Preferred over `as_doc_predicate` for BMP filtered queries because
234            /// bitset lookup is ~2ns vs ~30-40ns for a fast-field closure.
235            /// Default returns None; TermQuery overrides this to build from its
236            /// posting list in O(M) time.
237            fn as_doc_bitset(
238                &self,
239                _reader: &SegmentReader,
240            ) -> Option<DocBitset> {
241                None
242            }
243        }
244
245        /// Scored document stream: a DocSet that also provides scores.
246        pub trait Scorer: super::docset::DocSet + $($send_bounds)* {
247            /// Score for current document
248            fn score(&self) -> Score;
249
250            /// Get matched positions for the current document (if available)
251            /// Returns (field_id, positions) pairs where positions are encoded as per PositionMode
252            fn matched_positions(&self) -> Option<MatchedPositions> {
253                None
254            }
255        }
256    };
257}
258
259#[cfg(not(target_arch = "wasm32"))]
260define_query_traits!(Send + Sync);
261
262#[cfg(target_arch = "wasm32")]
263define_query_traits!();
264
265impl Query for Box<dyn Query> {
266    fn scorer<'a>(&self, reader: &'a SegmentReader, limit: usize) -> ScorerFuture<'a> {
267        (**self).scorer(reader, limit)
268    }
269
270    fn count_estimate<'a>(&self, reader: &'a SegmentReader) -> CountFuture<'a> {
271        (**self).count_estimate(reader)
272    }
273
274    fn decompose(&self) -> QueryDecomposition {
275        (**self).decompose()
276    }
277
278    fn is_filter(&self) -> bool {
279        (**self).is_filter()
280    }
281
282    fn as_doc_predicate<'a>(&self, reader: &'a SegmentReader) -> Option<DocPredicate<'a>> {
283        (**self).as_doc_predicate(reader)
284    }
285
286    fn as_doc_bitset(&self, reader: &SegmentReader) -> Option<DocBitset> {
287        (**self).as_doc_bitset(reader)
288    }
289
290    #[cfg(feature = "sync")]
291    fn scorer_sync<'a>(
292        &self,
293        reader: &'a SegmentReader,
294        limit: usize,
295    ) -> Result<Box<dyn Scorer + 'a>> {
296        (**self).scorer_sync(reader, limit)
297    }
298}
299
300/// Empty scorer for terms that don't exist
301pub struct EmptyScorer;
302
303impl super::docset::DocSet for EmptyScorer {
304    fn doc(&self) -> DocId {
305        crate::structures::TERMINATED
306    }
307
308    fn advance(&mut self) -> DocId {
309        crate::structures::TERMINATED
310    }
311
312    fn seek(&mut self, _target: DocId) -> DocId {
313        crate::structures::TERMINATED
314    }
315
316    fn size_hint(&self) -> u32 {
317        0
318    }
319}
320
321impl Scorer for EmptyScorer {
322    fn score(&self) -> Score {
323        0.0
324    }
325}