hermes_core/query/traits.rs
1//! Query and Scorer traits with async support
2//!
3//! Provides the core abstractions for search queries and document scoring.
4
5use std::future::Future;
6use std::pin::Pin;
7
8use crate::segment::SegmentReader;
9use crate::{DocId, Result, Score};
10
11/// BM25 parameters
12#[derive(Debug, Clone, Copy)]
13pub struct Bm25Params {
14 /// Term frequency saturation parameter (typically 1.2-2.0)
15 pub k1: f32,
16 /// Length normalization parameter (typically 0.75)
17 pub b: f32,
18}
19
20impl Default for Bm25Params {
21 fn default() -> Self {
22 Self { k1: 1.2, b: 0.75 }
23 }
24}
25
26/// Future type for scorer creation
27#[cfg(not(target_arch = "wasm32"))]
28pub type ScorerFuture<'a> = Pin<Box<dyn Future<Output = Result<Box<dyn Scorer + 'a>>> + Send + 'a>>;
29#[cfg(target_arch = "wasm32")]
30pub type ScorerFuture<'a> = Pin<Box<dyn Future<Output = Result<Box<dyn Scorer + 'a>>> + 'a>>;
31
32/// Future type for count estimation
33#[cfg(not(target_arch = "wasm32"))]
34pub type CountFuture<'a> = Pin<Box<dyn Future<Output = Result<u32>> + Send + 'a>>;
35#[cfg(target_arch = "wasm32")]
36pub type CountFuture<'a> = Pin<Box<dyn Future<Output = Result<u32>> + 'a>>;
37
38/// Per-document predicate closure type (platform-aware Send+Sync bounds)
39#[cfg(not(target_arch = "wasm32"))]
40pub type DocPredicate<'a> = Box<dyn Fn(DocId) -> bool + Send + Sync + 'a>;
41#[cfg(target_arch = "wasm32")]
42pub type DocPredicate<'a> = Box<dyn Fn(DocId) -> bool + 'a>;
43
44/// Compact bitset indexed by doc_id. O(1) lookup, ~2.25 MB for 18M docs.
45///
46/// Built from posting lists or predicate scans. Used by BMP filtered queries
47/// for fast per-slot predicate evaluation (~2ns per lookup vs ~30-40ns for
48/// a fast-field closure).
49pub struct DocBitset {
50 pub(crate) bits: Vec<u64>,
51}
52
53impl DocBitset {
54 /// Create an empty bitset for `num_docs` documents.
55 pub fn new(num_docs: u32) -> Self {
56 let num_words = (num_docs as usize).div_ceil(64);
57 Self {
58 bits: vec![0u64; num_words],
59 }
60 }
61
62 /// Set bit for `doc_id`.
63 #[inline]
64 pub fn set(&mut self, doc_id: u32) {
65 let word = doc_id as usize / 64;
66 let bit = doc_id as usize % 64;
67 if word < self.bits.len() {
68 self.bits[word] |= 1u64 << bit;
69 }
70 }
71
72 /// Test if `doc_id` is in the bitset.
73 #[inline(always)]
74 pub fn contains(&self, doc_id: u32) -> bool {
75 let word = doc_id as usize / 64;
76 let bit = doc_id as usize % 64;
77 word < self.bits.len() && self.bits[word] & (1u64 << bit) != 0
78 }
79
80 /// Number of set bits (matching docs).
81 pub fn count(&self) -> u32 {
82 self.bits.iter().map(|w| w.count_ones()).sum()
83 }
84
85 /// Build bitset from a predicate by scanning all docs. O(N).
86 pub fn from_predicate(num_docs: u32, pred: &dyn Fn(DocId) -> bool) -> Self {
87 let mut bs = Self::new(num_docs);
88 for doc_id in 0..num_docs {
89 if pred(doc_id) {
90 bs.set(doc_id);
91 }
92 }
93 bs
94 }
95
96 /// In-place OR (union): `self |= other`.
97 pub fn union_with(&mut self, other: &DocBitset) {
98 for (a, b) in self.bits.iter_mut().zip(other.bits.iter()) {
99 *a |= *b;
100 }
101 }
102
103 /// In-place AND (intersection): `self &= other`.
104 pub fn intersect_with(&mut self, other: &DocBitset) {
105 for (a, b) in self.bits.iter_mut().zip(other.bits.iter()) {
106 *a &= *b;
107 }
108 // Zero out any words beyond `other`'s length
109 for a in self.bits.iter_mut().skip(other.bits.len()) {
110 *a = 0;
111 }
112 }
113
114 /// In-place ANDNOT (subtract): `self &= !other`.
115 pub fn subtract(&mut self, other: &DocBitset) {
116 for (a, b) in self.bits.iter_mut().zip(other.bits.iter()) {
117 *a &= !*b;
118 }
119 }
120}
121
122/// Info for MaxScore-optimizable term queries
123#[derive(Debug, Clone)]
124pub struct TermQueryInfo {
125 /// Field being searched
126 pub field: crate::dsl::Field,
127 /// Term bytes (lowercase)
128 pub term: Vec<u8>,
129}
130
131/// Info for MaxScore-optimizable sparse term queries
132#[derive(Debug, Clone, Copy)]
133pub struct SparseTermQueryInfo {
134 /// Sparse vector field
135 pub field: crate::dsl::Field,
136 /// Dimension ID in the sparse vector
137 pub dim_id: u32,
138 /// Query weight for this dimension
139 pub weight: f32,
140 /// MaxScore heap factor (1.0 = exact, lower = approximate)
141 pub heap_factor: f32,
142 /// Multi-value combiner for ordinal deduplication
143 pub combiner: super::MultiValueCombiner,
144 /// Multiplier on executor limit to compensate for ordinal deduplication
145 /// (1.0 = exact, 2.0 = fetch 2x then combine down)
146 pub over_fetch_factor: f32,
147 /// Maximum superblocks to visit (LSP/0 gamma cap). 0 = unlimited.
148 pub max_superblocks: usize,
149}
150
151/// Decomposition of a query for MaxScore optimization.
152///
153/// The planner inspects this to decide whether to use text MaxScore,
154/// sparse MaxScore, or standard BooleanScorer execution.
155#[derive(Debug, Clone)]
156pub enum QueryDecomposition {
157 /// Single text term — eligible for text MaxScore grouping
158 TextTerm(TermQueryInfo),
159 /// One or more sparse dimensions — eligible for sparse MaxScore
160 SparseTerms(Vec<SparseTermQueryInfo>),
161 /// Not decomposable — falls back to standard execution
162 Opaque,
163}
164
165/// Matched positions for a field (field_id, list of scored positions)
166/// Each position includes its individual score contribution
167pub type MatchedPositions = Vec<(u32, Vec<super::ScoredPosition>)>;
168
169macro_rules! define_query_traits {
170 ($($send_bounds:tt)*) => {
171 /// A search query (async)
172 ///
173 /// Note: `scorer` takes `&self` (not `&'a self`) so that scorers don't borrow the query.
174 /// This enables query composition - queries can create sub-queries locally and get their scorers.
175 /// Implementations must clone/capture any data they need during scorer creation.
176 pub trait Query: std::fmt::Display + $($send_bounds)* {
177 /// Create a scorer for this query against a single segment (async)
178 ///
179 /// The `limit` parameter specifies the maximum number of results to return.
180 /// This is passed from the top-level search limit.
181 ///
182 /// Note: The scorer borrows only the reader, not the query. Implementations
183 /// should capture any needed query data (field, terms, etc.) during creation.
184 fn scorer<'a>(
185 &self,
186 reader: &'a SegmentReader,
187 limit: usize,
188 ) -> ScorerFuture<'a>;
189
190 /// Estimated number of matching documents in a segment (async)
191 fn count_estimate<'a>(&self, reader: &'a SegmentReader) -> CountFuture<'a>;
192
193 /// Create a scorer synchronously (mmap/RAM only).
194 ///
195 /// Available when the `sync` feature is enabled.
196 /// Default implementation returns an error.
197 #[cfg(feature = "sync")]
198 fn scorer_sync<'a>(
199 &self,
200 reader: &'a SegmentReader,
201 limit: usize,
202 ) -> Result<Box<dyn Scorer + 'a>> {
203 let _ = (reader, limit);
204 Err(crate::error::Error::Query(
205 "sync scorer not supported for this query type".into(),
206 ))
207 }
208
209 /// Decompose this query for MaxScore optimization.
210 ///
211 /// Returns `TextTerm` for simple term queries, `SparseTerms` for
212 /// sparse vector queries (single or multi-dim), or `Opaque` if
213 /// the query cannot be decomposed.
214 fn decompose(&self) -> QueryDecomposition {
215 QueryDecomposition::Opaque
216 }
217
218 /// True if this query is a pure filter (always scores 1.0, no positions).
219 /// Used by the planner to convert non-selective MUST filters into predicates.
220 fn is_filter(&self) -> bool {
221 false
222 }
223
224 /// For filter queries: return a cheap per-doc predicate against a segment.
225 /// The predicate does O(1) work per doc (e.g., fast-field lookup).
226 fn as_doc_predicate<'a>(
227 &self,
228 _reader: &'a SegmentReader,
229 ) -> Option<DocPredicate<'a>> {
230 None
231 }
232
233 /// Build a compact bitset of matching doc_ids for this query.
234 ///
235 /// Preferred over `as_doc_predicate` for BMP filtered queries because
236 /// bitset lookup is ~2ns vs ~30-40ns for a fast-field closure.
237 /// Default returns None; TermQuery overrides this to build from its
238 /// posting list in O(M) time.
239 fn as_doc_bitset(
240 &self,
241 _reader: &SegmentReader,
242 ) -> Option<DocBitset> {
243 None
244 }
245 }
246
247 /// Scored document stream: a DocSet that also provides scores.
248 pub trait Scorer: super::docset::DocSet + $($send_bounds)* {
249 /// Score for current document
250 fn score(&self) -> Score;
251
252 /// Get matched positions for the current document (if available)
253 /// Returns (field_id, positions) pairs where positions are encoded as per PositionMode
254 fn matched_positions(&self) -> Option<MatchedPositions> {
255 None
256 }
257 }
258 };
259}
260
261#[cfg(not(target_arch = "wasm32"))]
262define_query_traits!(Send + Sync);
263
264#[cfg(target_arch = "wasm32")]
265define_query_traits!();
266
267impl Query for Box<dyn Query> {
268 fn scorer<'a>(&self, reader: &'a SegmentReader, limit: usize) -> ScorerFuture<'a> {
269 (**self).scorer(reader, limit)
270 }
271
272 fn count_estimate<'a>(&self, reader: &'a SegmentReader) -> CountFuture<'a> {
273 (**self).count_estimate(reader)
274 }
275
276 fn decompose(&self) -> QueryDecomposition {
277 (**self).decompose()
278 }
279
280 fn is_filter(&self) -> bool {
281 (**self).is_filter()
282 }
283
284 fn as_doc_predicate<'a>(&self, reader: &'a SegmentReader) -> Option<DocPredicate<'a>> {
285 (**self).as_doc_predicate(reader)
286 }
287
288 fn as_doc_bitset(&self, reader: &SegmentReader) -> Option<DocBitset> {
289 (**self).as_doc_bitset(reader)
290 }
291
292 #[cfg(feature = "sync")]
293 fn scorer_sync<'a>(
294 &self,
295 reader: &'a SegmentReader,
296 limit: usize,
297 ) -> Result<Box<dyn Scorer + 'a>> {
298 (**self).scorer_sync(reader, limit)
299 }
300}
301
302/// Empty scorer for terms that don't exist
303pub struct EmptyScorer;
304
305impl super::docset::DocSet for EmptyScorer {
306 fn doc(&self) -> DocId {
307 crate::structures::TERMINATED
308 }
309
310 fn advance(&mut self) -> DocId {
311 crate::structures::TERMINATED
312 }
313
314 fn seek(&mut self, _target: DocId) -> DocId {
315 crate::structures::TERMINATED
316 }
317
318 fn size_hint(&self) -> u32 {
319 0
320 }
321}
322
323impl Scorer for EmptyScorer {
324 fn score(&self) -> Score {
325 0.0
326 }
327}