hermes_core/query/traits.rs
1//! Query and Scorer traits with async support
2//!
3//! Provides the core abstractions for search queries and document scoring.
4
5use std::future::Future;
6use std::pin::Pin;
7
8use crate::segment::SegmentReader;
9use crate::{DocId, Result, Score};
10
11/// BM25 parameters
12#[derive(Debug, Clone, Copy)]
13pub struct Bm25Params {
14 /// Term frequency saturation parameter (typically 1.2-2.0)
15 pub k1: f32,
16 /// Length normalization parameter (typically 0.75)
17 pub b: f32,
18}
19
20impl Default for Bm25Params {
21 fn default() -> Self {
22 Self { k1: 1.2, b: 0.75 }
23 }
24}
25
26/// Future type for scorer creation
27#[cfg(not(target_arch = "wasm32"))]
28pub type ScorerFuture<'a> = Pin<Box<dyn Future<Output = Result<Box<dyn Scorer + 'a>>> + Send + 'a>>;
29#[cfg(target_arch = "wasm32")]
30pub type ScorerFuture<'a> = Pin<Box<dyn Future<Output = Result<Box<dyn Scorer + 'a>>> + 'a>>;
31
32/// Future type for count estimation
33#[cfg(not(target_arch = "wasm32"))]
34pub type CountFuture<'a> = Pin<Box<dyn Future<Output = Result<u32>> + Send + 'a>>;
35#[cfg(target_arch = "wasm32")]
36pub type CountFuture<'a> = Pin<Box<dyn Future<Output = Result<u32>> + 'a>>;
37
38/// Per-document predicate closure type (platform-aware Send+Sync bounds)
39#[cfg(not(target_arch = "wasm32"))]
40pub type DocPredicate<'a> = Box<dyn Fn(DocId) -> bool + Send + Sync + 'a>;
41#[cfg(target_arch = "wasm32")]
42pub type DocPredicate<'a> = Box<dyn Fn(DocId) -> bool + 'a>;
43
44/// Compact bitset indexed by doc_id. O(1) lookup, ~2.25 MB for 18M docs.
45///
46/// Built from posting lists or predicate scans. Used by BMP filtered queries
47/// for fast per-slot predicate evaluation (~2ns per lookup vs ~30-40ns for
48/// a fast-field closure).
49pub struct DocBitset {
50 pub(crate) bits: Vec<u64>,
51}
52
53impl DocBitset {
54 /// Create an empty bitset for `num_docs` documents.
55 pub fn new(num_docs: u32) -> Self {
56 let num_words = (num_docs as usize).div_ceil(64);
57 Self {
58 bits: vec![0u64; num_words],
59 }
60 }
61
62 /// Set bit for `doc_id`.
63 #[inline]
64 pub fn set(&mut self, doc_id: u32) {
65 let word = doc_id as usize / 64;
66 let bit = doc_id as usize % 64;
67 if word < self.bits.len() {
68 self.bits[word] |= 1u64 << bit;
69 }
70 }
71
72 /// Test if `doc_id` is in the bitset.
73 #[inline(always)]
74 pub fn contains(&self, doc_id: u32) -> bool {
75 let word = doc_id as usize / 64;
76 let bit = doc_id as usize % 64;
77 word < self.bits.len() && self.bits[word] & (1u64 << bit) != 0
78 }
79
80 /// Number of set bits (matching docs).
81 pub fn count(&self) -> u32 {
82 self.bits.iter().map(|w| w.count_ones()).sum()
83 }
84
85 /// Build bitset from a predicate by scanning all docs. O(N).
86 pub fn from_predicate(num_docs: u32, pred: &dyn Fn(DocId) -> bool) -> Self {
87 let mut bs = Self::new(num_docs);
88 for doc_id in 0..num_docs {
89 if pred(doc_id) {
90 bs.set(doc_id);
91 }
92 }
93 bs
94 }
95
96 /// In-place OR (union): `self |= other`.
97 pub fn union_with(&mut self, other: &DocBitset) {
98 for (a, b) in self.bits.iter_mut().zip(other.bits.iter()) {
99 *a |= *b;
100 }
101 }
102
103 /// In-place AND (intersection): `self &= other`.
104 pub fn intersect_with(&mut self, other: &DocBitset) {
105 for (a, b) in self.bits.iter_mut().zip(other.bits.iter()) {
106 *a &= *b;
107 }
108 // Zero out any words beyond `other`'s length
109 for a in self.bits.iter_mut().skip(other.bits.len()) {
110 *a = 0;
111 }
112 }
113
114 /// In-place ANDNOT (subtract): `self &= !other`.
115 pub fn subtract(&mut self, other: &DocBitset) {
116 for (a, b) in self.bits.iter_mut().zip(other.bits.iter()) {
117 *a &= !*b;
118 }
119 }
120}
121
122/// Info for MaxScore-optimizable term queries
123#[derive(Debug, Clone)]
124pub struct TermQueryInfo {
125 /// Field being searched
126 pub field: crate::dsl::Field,
127 /// Term bytes (lowercase)
128 pub term: Vec<u8>,
129}
130
131/// Info for MaxScore-optimizable sparse term queries
132#[derive(Debug, Clone, Copy)]
133pub struct SparseTermQueryInfo {
134 /// Sparse vector field
135 pub field: crate::dsl::Field,
136 /// Dimension ID in the sparse vector
137 pub dim_id: u32,
138 /// Query weight for this dimension
139 pub weight: f32,
140 /// MaxScore heap factor (1.0 = exact, lower = approximate)
141 pub heap_factor: f32,
142 /// Multi-value combiner for ordinal deduplication
143 pub combiner: super::MultiValueCombiner,
144 /// Multiplier on executor limit to compensate for ordinal deduplication
145 /// (1.0 = exact, 2.0 = fetch 2x then combine down)
146 pub over_fetch_factor: f32,
147}
148
149/// Decomposition of a query for MaxScore optimization.
150///
151/// The planner inspects this to decide whether to use text MaxScore,
152/// sparse MaxScore, or standard BooleanScorer execution.
153#[derive(Debug, Clone)]
154pub enum QueryDecomposition {
155 /// Single text term — eligible for text MaxScore grouping
156 TextTerm(TermQueryInfo),
157 /// One or more sparse dimensions — eligible for sparse MaxScore
158 SparseTerms(Vec<SparseTermQueryInfo>),
159 /// Not decomposable — falls back to standard execution
160 Opaque,
161}
162
163/// Matched positions for a field (field_id, list of scored positions)
164/// Each position includes its individual score contribution
165pub type MatchedPositions = Vec<(u32, Vec<super::ScoredPosition>)>;
166
167macro_rules! define_query_traits {
168 ($($send_bounds:tt)*) => {
169 /// A search query (async)
170 ///
171 /// Note: `scorer` takes `&self` (not `&'a self`) so that scorers don't borrow the query.
172 /// This enables query composition - queries can create sub-queries locally and get their scorers.
173 /// Implementations must clone/capture any data they need during scorer creation.
174 pub trait Query: std::fmt::Display + $($send_bounds)* {
175 /// Create a scorer for this query against a single segment (async)
176 ///
177 /// The `limit` parameter specifies the maximum number of results to return.
178 /// This is passed from the top-level search limit.
179 ///
180 /// Note: The scorer borrows only the reader, not the query. Implementations
181 /// should capture any needed query data (field, terms, etc.) during creation.
182 fn scorer<'a>(
183 &self,
184 reader: &'a SegmentReader,
185 limit: usize,
186 ) -> ScorerFuture<'a>;
187
188 /// Estimated number of matching documents in a segment (async)
189 fn count_estimate<'a>(&self, reader: &'a SegmentReader) -> CountFuture<'a>;
190
191 /// Create a scorer synchronously (mmap/RAM only).
192 ///
193 /// Available when the `sync` feature is enabled.
194 /// Default implementation returns an error.
195 #[cfg(feature = "sync")]
196 fn scorer_sync<'a>(
197 &self,
198 reader: &'a SegmentReader,
199 limit: usize,
200 ) -> Result<Box<dyn Scorer + 'a>> {
201 let _ = (reader, limit);
202 Err(crate::error::Error::Query(
203 "sync scorer not supported for this query type".into(),
204 ))
205 }
206
207 /// Decompose this query for MaxScore optimization.
208 ///
209 /// Returns `TextTerm` for simple term queries, `SparseTerms` for
210 /// sparse vector queries (single or multi-dim), or `Opaque` if
211 /// the query cannot be decomposed.
212 fn decompose(&self) -> QueryDecomposition {
213 QueryDecomposition::Opaque
214 }
215
216 /// True if this query is a pure filter (always scores 1.0, no positions).
217 /// Used by the planner to convert non-selective MUST filters into predicates.
218 fn is_filter(&self) -> bool {
219 false
220 }
221
222 /// For filter queries: return a cheap per-doc predicate against a segment.
223 /// The predicate does O(1) work per doc (e.g., fast-field lookup).
224 fn as_doc_predicate<'a>(
225 &self,
226 _reader: &'a SegmentReader,
227 ) -> Option<DocPredicate<'a>> {
228 None
229 }
230
231 /// Build a compact bitset of matching doc_ids for this query.
232 ///
233 /// Preferred over `as_doc_predicate` for BMP filtered queries because
234 /// bitset lookup is ~2ns vs ~30-40ns for a fast-field closure.
235 /// Default returns None; TermQuery overrides this to build from its
236 /// posting list in O(M) time.
237 fn as_doc_bitset(
238 &self,
239 _reader: &SegmentReader,
240 ) -> Option<DocBitset> {
241 None
242 }
243 }
244
245 /// Scored document stream: a DocSet that also provides scores.
246 pub trait Scorer: super::docset::DocSet + $($send_bounds)* {
247 /// Score for current document
248 fn score(&self) -> Score;
249
250 /// Get matched positions for the current document (if available)
251 /// Returns (field_id, positions) pairs where positions are encoded as per PositionMode
252 fn matched_positions(&self) -> Option<MatchedPositions> {
253 None
254 }
255 }
256 };
257}
258
259#[cfg(not(target_arch = "wasm32"))]
260define_query_traits!(Send + Sync);
261
262#[cfg(target_arch = "wasm32")]
263define_query_traits!();
264
265impl Query for Box<dyn Query> {
266 fn scorer<'a>(&self, reader: &'a SegmentReader, limit: usize) -> ScorerFuture<'a> {
267 (**self).scorer(reader, limit)
268 }
269
270 fn count_estimate<'a>(&self, reader: &'a SegmentReader) -> CountFuture<'a> {
271 (**self).count_estimate(reader)
272 }
273
274 fn decompose(&self) -> QueryDecomposition {
275 (**self).decompose()
276 }
277
278 fn is_filter(&self) -> bool {
279 (**self).is_filter()
280 }
281
282 fn as_doc_predicate<'a>(&self, reader: &'a SegmentReader) -> Option<DocPredicate<'a>> {
283 (**self).as_doc_predicate(reader)
284 }
285
286 fn as_doc_bitset(&self, reader: &SegmentReader) -> Option<DocBitset> {
287 (**self).as_doc_bitset(reader)
288 }
289
290 #[cfg(feature = "sync")]
291 fn scorer_sync<'a>(
292 &self,
293 reader: &'a SegmentReader,
294 limit: usize,
295 ) -> Result<Box<dyn Scorer + 'a>> {
296 (**self).scorer_sync(reader, limit)
297 }
298}
299
300/// Empty scorer for terms that don't exist
301pub struct EmptyScorer;
302
303impl super::docset::DocSet for EmptyScorer {
304 fn doc(&self) -> DocId {
305 crate::structures::TERMINATED
306 }
307
308 fn advance(&mut self) -> DocId {
309 crate::structures::TERMINATED
310 }
311
312 fn seek(&mut self, _target: DocId) -> DocId {
313 crate::structures::TERMINATED
314 }
315
316 fn size_hint(&self) -> u32 {
317 0
318 }
319}
320
321impl Scorer for EmptyScorer {
322 fn score(&self) -> Score {
323 0.0
324 }
325}