bearing 0.1.0-alpha.5

A Rust port of Apache Lucene
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
// SPDX-License-Identifier: Apache-2.0

//! Collection pipeline: `Collector`, `LeafCollector`, `CollectorManager`, `ScoreMode`,
//! `DocIdStream`, `SimpleScorable`, and `DocAndFloatFeatureBuffer`.

use std::cell::Cell;
use std::fmt;
use std::io;
use std::rc::Rc;

use super::doc_id_set_iterator::{DocIdSetIterator, NO_MORE_DOCS};
use super::scorable::Scorable;
use crate::index::directory_reader::LeafReaderContext;

// ---------------------------------------------------------------------------
// ScoreMode
// ---------------------------------------------------------------------------

/// Different modes of search.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ScoreMode {
    /// Produced scorers will allow visiting all matches and get their score.
    Complete,
    /// Produced scorers will allow visiting all matches but scores won't be available.
    CompleteNoScores,
    /// Produced scorers will optionally allow skipping over non-competitive hits using the
    /// `Scorer::set_min_competitive_score` API.
    TopScores,
    /// ScoreMode for top field collectors that can provide their own iterators, to optionally
    /// allow to skip for non-competitive docs.
    TopDocs,
    /// ScoreMode for top field collectors that can provide their own iterators, to optionally
    /// allow to skip for non-competitive docs. This mode is used when there is a secondary
    /// sort by score.
    TopDocsWithScores,
}

impl ScoreMode {
    /// Whether this `ScoreMode` needs to compute scores.
    pub fn needs_scores(&self) -> bool {
        match self {
            ScoreMode::Complete => true,
            ScoreMode::CompleteNoScores => false,
            ScoreMode::TopScores => true,
            ScoreMode::TopDocs => false,
            ScoreMode::TopDocsWithScores => true,
        }
    }

    /// Returns `true` if for this `ScoreMode` it is necessary to process all documents, or
    /// `false` if it is enough to go through top documents only.
    pub fn is_exhaustive(&self) -> bool {
        match self {
            ScoreMode::Complete => true,
            ScoreMode::CompleteNoScores => true,
            ScoreMode::TopScores => false,
            ScoreMode::TopDocs => false,
            ScoreMode::TopDocsWithScores => false,
        }
    }
}

// ---------------------------------------------------------------------------
// DocIdStream
// ---------------------------------------------------------------------------

/// A stream of doc IDs. Doc IDs may be consumed at most once.
pub trait DocIdStream {
    /// Iterate over doc IDs contained in this stream up to the given `up_to` exclusive,
    /// calling the given consumer on them. It is not possible to iterate these doc IDs again
    /// later on.
    fn for_each_up_to(
        &mut self,
        up_to: i32,
        consumer: &mut dyn FnMut(i32) -> io::Result<()>,
    ) -> io::Result<()>;

    /// Iterate over all remaining doc IDs in this stream, calling the given consumer on them.
    /// This is a terminal operation.
    fn for_each(&mut self, consumer: &mut dyn FnMut(i32) -> io::Result<()>) -> io::Result<()> {
        self.for_each_up_to(NO_MORE_DOCS, consumer)
    }

    /// Count the number of doc IDs in this stream that are below the given `up_to`. These doc
    /// IDs may not be consumed again later.
    fn count_up_to(&mut self, up_to: i32) -> io::Result<i32>;

    /// Count the number of entries in this stream. This is a terminal operation.
    fn count(&mut self) -> io::Result<i32> {
        self.count_up_to(NO_MORE_DOCS)
    }

    /// Return `true` if this stream may have remaining doc IDs. This must eventually return
    /// `false` when the stream is exhausted.
    fn may_have_remaining(&self) -> bool;
}

/// A `DocIdStream` over a contiguous range `[min, max)`.
#[derive(Debug)]
pub struct RangeDocIdStream {
    up_to: i32,
    max: i32,
}

impl RangeDocIdStream {
    /// Creates a new `RangeDocIdStream` over `[min, max)`.
    ///
    /// # Panics
    ///
    /// Panics if `min >= max`.
    pub fn new(min: i32, max: i32) -> Self {
        assert!(min < max, "min = {} >= max = {}", min, max);
        Self { up_to: min, max }
    }
}

impl DocIdStream for RangeDocIdStream {
    fn for_each_up_to(
        &mut self,
        up_to: i32,
        consumer: &mut dyn FnMut(i32) -> io::Result<()>,
    ) -> io::Result<()> {
        if up_to > self.up_to {
            let up_to = up_to.min(self.max);
            for doc in self.up_to..up_to {
                consumer(doc)?;
            }
            self.up_to = up_to;
        }
        Ok(())
    }

    fn count_up_to(&mut self, up_to: i32) -> io::Result<i32> {
        if up_to > self.up_to {
            let up_to = up_to.min(self.max);
            let count = up_to - self.up_to;
            self.up_to = up_to;
            Ok(count)
        } else {
            Ok(0)
        }
    }

    fn may_have_remaining(&self) -> bool {
        self.up_to < self.max
    }
}

// ---------------------------------------------------------------------------
// LeafCollector
// ---------------------------------------------------------------------------

/// Shared scoring context between the BulkScorer and the LeafCollector.
///
/// In Java, `LeafCollector.setScorer(Scorable)` passes a reference that the collector stores
/// and later calls `scorer.score()` on. In Rust, we can't store the `&mut dyn Scorable`
/// reference across `set_scorer`/`collect` calls due to lifetime constraints.
///
/// `ScoreContext` provides safe shared access via `Cell<f32>`: the BulkScorer writes the
/// current score before each `collect()` call, and the collector reads it. The collector
/// writes `min_competitive_score` to signal the scorer to skip non-competitive docs.
#[derive(Debug)]
pub struct ScoreContext {
    /// The current document's score. Written by the BulkScorer before `collect()`.
    pub score: Cell<f32>,
    /// The minimum competitive score. Written by the collector, read by the BulkScorer
    /// to propagate to the scorer via `Scorer::set_min_competitive_score`.
    pub min_competitive_score: Cell<f32>,
}

impl ScoreContext {
    /// Creates a new `ScoreContext` with zeroed values.
    pub fn new() -> Rc<Self> {
        Rc::new(Self {
            score: Cell::new(0.0),
            min_competitive_score: Cell::new(0.0),
        })
    }
}

/// Per-segment collector that receives matching documents and their scores.
///
/// The `set_scorer` method is called before collection begins, passing a shared `ScoreContext`.
/// The BulkScorer writes the current document's score to the context before calling `collect()`.
/// Collectors that need the score read it from the context.
pub trait LeafCollector: fmt::Debug {
    /// Called before successive calls to `collect`. The `score_context` is shared between the
    /// BulkScorer (which writes the score) and this collector (which reads it and may write
    /// `min_competitive_score`).
    fn set_scorer(&mut self, score_context: Rc<ScoreContext>) -> io::Result<()>;

    /// Called once for every document matching a query, with the unbased document number.
    fn collect(&mut self, doc: i32) -> io::Result<()>;

    /// Collect a range of doc IDs, between `min` inclusive and `max` exclusive. `max` is
    /// guaranteed to be greater than `min`.
    ///
    /// The default implementation calls `collect_stream` on a `RangeDocIdStream`.
    fn collect_range(&mut self, min: i32, max: i32) -> io::Result<()> {
        let mut stream = RangeDocIdStream::new(min, max);
        self.collect_stream(&mut stream)
    }

    /// Bulk-collect doc IDs from a `DocIdStream`.
    ///
    /// The default implementation buffers doc IDs from the stream and then collects them.
    fn collect_stream(&mut self, stream: &mut dyn DocIdStream) -> io::Result<()> {
        let mut docs = Vec::new();
        stream.for_each(&mut |doc| {
            docs.push(doc);
            Ok(())
        })?;
        for doc in docs {
            self.collect(doc)?;
        }
        Ok(())
    }

    /// Optionally returns an iterator over competitive documents. Returns `None` by default.
    fn competitive_iterator(&self) -> Option<Box<dyn DocIdSetIterator>> {
        None
    }

    /// Hook that gets called once the leaf that is associated with this collector has finished
    /// collecting successfully. The default implementation does nothing.
    fn finish(&mut self) -> io::Result<()> {
        Ok(())
    }
}

// ---------------------------------------------------------------------------
// Collector
// ---------------------------------------------------------------------------

/// Expert: Collectors are primarily meant to be used to gather raw results from a search, and
/// implement sorting or custom result filtering, collation, etc.
pub trait Collector: fmt::Debug {
    /// The type of `LeafCollector` this collector creates.
    type Leaf: LeafCollector;

    /// Create a new `LeafCollector` to collect the given context.
    fn get_leaf_collector(&mut self, context: &LeafReaderContext) -> io::Result<Self::Leaf>;

    /// Indicates what features are required from the scorer.
    fn score_mode(&self) -> ScoreMode;
}

/// A manager of collectors. This is useful to parallelize execution of search requests.
///
/// - `new_collector()` must return a NEW collector which will be used to collect a certain
///   set of leaves.
/// - `reduce()` will be used to reduce the results of individual collections into a
///   meaningful result. This method is only called after all leaves have been fully collected.
pub trait CollectorManager: fmt::Debug {
    /// The type of `Collector` this manager creates.
    type Coll: Collector;
    /// The result type produced by `reduce`.
    type Result;

    /// Return a new `Collector`. This must return a different instance on each call.
    fn new_collector(&self) -> io::Result<Self::Coll>;

    /// Reduce the results of individual collectors into a meaningful result.
    fn reduce(&self, collectors: Vec<Self::Coll>) -> io::Result<Self::Result>;
}

// ---------------------------------------------------------------------------
// SimpleScorable
// ---------------------------------------------------------------------------

/// Simplest implementation of `Scorable`, implemented via simple getters and setters.
#[derive(Debug)]
pub struct SimpleScorable {
    /// The current score.
    pub score: f32,
    /// The minimum competitive score.
    pub min_competitive_score: f32,
}

impl SimpleScorable {
    /// Sole constructor.
    pub fn new() -> Self {
        Self {
            score: 0.0,
            min_competitive_score: 0.0,
        }
    }

    /// Set the score.
    pub fn set_score(&mut self, score: f32) {
        self.score = score;
    }

    /// Get the min competitive score.
    pub fn min_competitive_score(&self) -> f32 {
        self.min_competitive_score
    }
}

impl Default for SimpleScorable {
    fn default() -> Self {
        Self::new()
    }
}

impl Scorable for SimpleScorable {
    fn score(&mut self) -> io::Result<f32> {
        Ok(self.score)
    }

    fn set_min_competitive_score(&mut self, min_score: f32) -> io::Result<()> {
        self.min_competitive_score = min_score;
        Ok(())
    }
}

// ---------------------------------------------------------------------------
// DocAndFloatFeatureBuffer
// ---------------------------------------------------------------------------

/// Wrapper around parallel arrays storing doc IDs and their corresponding features, stored as
/// `f32`. These features may be anything, but are typically a term frequency or a score.
pub struct DocAndFloatFeatureBuffer {
    /// Doc IDs.
    pub docs: Vec<i32>,
    /// Float-valued features.
    pub features: Vec<f32>,
    /// Number of valid entries in the doc ID and float-valued feature arrays.
    pub size: usize,
}

impl fmt::Debug for DocAndFloatFeatureBuffer {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.debug_struct("DocAndFloatFeatureBuffer")
            .field("size", &self.size)
            .field("capacity", &self.docs.len())
            .finish()
    }
}

impl DocAndFloatFeatureBuffer {
    /// Sole constructor.
    pub fn new() -> Self {
        Self {
            docs: Vec::new(),
            features: Vec::new(),
            size: 0,
        }
    }

    /// Grow both arrays to ensure that they can store at least the given number of entries.
    /// Does not preserve existing contents.
    pub fn grow_no_copy(&mut self, min_size: usize) {
        if self.docs.len() < min_size {
            self.docs.resize(min_size, 0);
            self.features.resize(self.docs.len(), 0.0);
        }
    }
}

impl Default for DocAndFloatFeatureBuffer {
    fn default() -> Self {
        Self::new()
    }
}

// ---------------------------------------------------------------------------
// DocAndScoreAccBuffer
// ---------------------------------------------------------------------------

/// Wrapper around parallel arrays storing doc IDs and their corresponding score accumulators,
/// stored as `f64` for precision during multi-clause score accumulation.
pub struct DocAndScoreAccBuffer {
    /// Doc IDs.
    pub docs: Vec<i32>,
    /// Scores (double-precision accumulators).
    pub scores: Vec<f64>,
    /// Number of valid entries in the doc ID and score arrays.
    pub size: usize,
}

impl fmt::Debug for DocAndScoreAccBuffer {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.debug_struct("DocAndScoreAccBuffer")
            .field("size", &self.size)
            .field("capacity", &self.docs.len())
            .finish()
    }
}

impl DocAndScoreAccBuffer {
    /// Sole constructor.
    pub fn new() -> Self {
        Self {
            docs: Vec::new(),
            scores: Vec::new(),
            size: 0,
        }
    }

    /// Grow both arrays to ensure that they can store at least the given number of entries.
    /// Existing content may be discarded.
    pub fn grow_no_copy(&mut self, min_size: usize) {
        if self.docs.len() < min_size {
            self.docs.resize(min_size, 0);
            self.scores = vec![0.0; self.docs.len()];
        }
    }

    /// Grow both arrays to ensure that they can store at least the given number of entries.
    /// Existing content is preserved.
    pub fn grow(&mut self, min_size: usize) {
        if self.docs.len() < min_size {
            self.docs.resize(min_size, 0);
            self.scores.resize(self.docs.len(), 0.0);
        }
    }

    /// Copy content from the given `DocAndFloatFeatureBuffer`, expanding float scores to
    /// doubles.
    pub fn copy_from(&mut self, buffer: &DocAndFloatFeatureBuffer) {
        self.grow_no_copy(buffer.size);
        self.docs[..buffer.size].copy_from_slice(&buffer.docs[..buffer.size]);
        for i in 0..buffer.size {
            self.scores[i] = buffer.features[i] as f64;
        }
        self.size = buffer.size;
    }
}

impl Default for DocAndScoreAccBuffer {
    fn default() -> Self {
        Self::new()
    }
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;

    // -- DocIdStream tests --

    #[test]
    fn test_range_stream_for_each() {
        let mut stream = RangeDocIdStream::new(5, 8);
        let mut docs = Vec::new();
        stream
            .for_each(&mut |doc| {
                docs.push(doc);
                Ok(())
            })
            .unwrap();
        assert_eq!(docs, vec![5, 6, 7]);
        assert!(!stream.may_have_remaining());
    }

    #[test]
    fn test_range_stream_for_each_up_to() {
        let mut stream = RangeDocIdStream::new(0, 10);
        let mut docs = Vec::new();
        stream
            .for_each_up_to(5, &mut |doc| {
                docs.push(doc);
                Ok(())
            })
            .unwrap();
        assert_eq!(docs, vec![0, 1, 2, 3, 4]);
        assert!(stream.may_have_remaining());

        docs.clear();
        stream
            .for_each(&mut |doc| {
                docs.push(doc);
                Ok(())
            })
            .unwrap();
        assert_eq!(docs, vec![5, 6, 7, 8, 9]);
        assert!(!stream.may_have_remaining());
    }

    #[test]
    fn test_range_stream_count() {
        let mut stream = RangeDocIdStream::new(0, 10);
        assert_eq!(stream.count().unwrap(), 10);
        assert!(!stream.may_have_remaining());
    }

    #[test]
    fn test_range_stream_count_up_to() {
        let mut stream = RangeDocIdStream::new(0, 10);
        assert_eq!(stream.count_up_to(5).unwrap(), 5);
        assert!(stream.may_have_remaining());
        assert_eq!(stream.count_up_to(5).unwrap(), 0);
        assert_eq!(stream.count_up_to(20).unwrap(), 5);
        assert!(!stream.may_have_remaining());
    }

    #[test]
    #[should_panic(expected = "min = 5 >= max = 5")]
    fn test_range_stream_invalid() {
        RangeDocIdStream::new(5, 5);
    }

    // -- SimpleScorable tests --

    #[test]
    fn test_simple_scorable_default_score() {
        let mut s = SimpleScorable::new();
        assert_eq!(s.score().unwrap(), 0.0);
    }

    #[test]
    fn test_simple_scorable_set_and_get_score() {
        let mut s = SimpleScorable::new();
        s.set_score(2.5);
        assert_eq!(s.score().unwrap(), 2.5);
    }

    #[test]
    fn test_simple_scorable_min_competitive_score() {
        let mut s = SimpleScorable::new();
        assert_eq!(s.min_competitive_score(), 0.0);
        s.set_min_competitive_score(1.0).unwrap();
        assert_eq!(s.min_competitive_score(), 1.0);
    }

    // -- DocAndFloatFeatureBuffer tests --

    #[test]
    fn test_feature_buffer_new() {
        let buf = DocAndFloatFeatureBuffer::new();
        assert_eq!(buf.size, 0);
        assert_is_empty!(buf.docs);
        assert_is_empty!(buf.features);
    }

    #[test]
    fn test_feature_buffer_grow_no_copy() {
        let mut buf = DocAndFloatFeatureBuffer::new();
        buf.grow_no_copy(128);
        assert_ge!(buf.docs.len(), 128);
        assert_ge!(buf.features.len(), 128);
    }

    #[test]
    fn test_feature_buffer_grow_no_copy_already_large_enough() {
        let mut buf = DocAndFloatFeatureBuffer::new();
        buf.grow_no_copy(128);
        let old_len = buf.docs.len();
        buf.grow_no_copy(64);
        assert_eq!(buf.docs.len(), old_len);
    }

    // -- DocAndScoreAccBuffer tests --

    #[test]
    fn test_score_acc_buffer_new() {
        let buf = DocAndScoreAccBuffer::new();
        assert_eq!(buf.size, 0);
        assert_is_empty!(buf.docs);
        assert_is_empty!(buf.scores);
    }

    #[test]
    fn test_score_acc_buffer_grow_no_copy() {
        let mut buf = DocAndScoreAccBuffer::new();
        buf.grow_no_copy(128);
        assert_ge!(buf.docs.len(), 128);
        assert_ge!(buf.scores.len(), 128);
    }

    #[test]
    fn test_score_acc_buffer_grow_preserves() {
        let mut buf = DocAndScoreAccBuffer::new();
        buf.grow(4);
        buf.docs[0] = 42;
        buf.scores[0] = 1.5;
        buf.size = 1;
        buf.grow(128);
        assert_ge!(buf.docs.len(), 128);
        assert_eq!(buf.docs[0], 42);
        assert_in_delta!(buf.scores[0], 1.5, 1e-10);
    }

    #[test]
    fn test_score_acc_buffer_copy_from() {
        let mut float_buf = DocAndFloatFeatureBuffer::new();
        float_buf.grow_no_copy(3);
        float_buf.docs[0] = 10;
        float_buf.docs[1] = 20;
        float_buf.docs[2] = 30;
        float_buf.features[0] = 1.5;
        float_buf.features[1] = 2.5;
        float_buf.features[2] = 3.5;
        float_buf.size = 3;

        let mut acc_buf = DocAndScoreAccBuffer::new();
        acc_buf.copy_from(&float_buf);

        assert_eq!(acc_buf.size, 3);
        assert_eq!(acc_buf.docs[0], 10);
        assert_eq!(acc_buf.docs[1], 20);
        assert_eq!(acc_buf.docs[2], 30);
        assert_in_delta!(acc_buf.scores[0], 1.5, 1e-10);
        assert_in_delta!(acc_buf.scores[1], 2.5, 1e-10);
        assert_in_delta!(acc_buf.scores[2], 3.5, 1e-10);
    }
}