laurus 0.4.2

Unified search library for lexical, vector, and semantic retrieval
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
//! Phrase query implementation for exact phrase matching.

use std::collections::HashMap;
use std::fmt::Debug;

use crate::error::Result;
use crate::lexical::query::Query;
use crate::lexical::query::matcher::{EmptyMatcher, Matcher};
use crate::lexical::query::scorer::{BM25Scorer, Scorer};
use crate::lexical::reader::LexicalIndexReader;

/// A matcher that finds documents containing phrase matches.
/// A matcher that finds documents containing phrase matches.
///
/// All phrase matches are computed eagerly at construction time and then
/// iterated in ascending document-ID order through the `Matcher` trait.
#[derive(Debug)]
pub struct PhraseMatcher {
    /// Matching documents with phrase frequencies, sorted by ascending doc ID.
    matches: Vec<PhraseMatch>,
    /// Index of the current match within `matches`.
    current_index: usize,
    /// Document ID of the current match (`u64::MAX` when empty or exhausted).
    current_doc_id: u64,
}

/// A phrase match in a specific document.
/// A phrase match in a specific document.
#[derive(Debug, Clone)]
pub struct PhraseMatch {
    /// Document ID.
    pub doc_id: u64,
    /// Number of phrase occurrences in this document
    /// (equals `positions.len()`).
    pub phrase_freq: u32,
    /// Starting token positions (position of the first term) of each
    /// phrase occurrence.
    pub positions: Vec<u64>,
}

impl PhraseMatcher {
    /// Create a new phrase matcher.
    pub fn new(
        reader: &dyn LexicalIndexReader,
        field: &str,
        terms: &[String],
        slop: u32,
    ) -> Result<Self> {
        let matches = Self::find_phrase_matches(reader, field, terms, slop)?;

        let current_doc_id = if matches.is_empty() {
            u64::MAX // Invalid state when no matches
        } else {
            matches[0].doc_id
        };

        Ok(PhraseMatcher {
            matches,
            current_index: 0,
            current_doc_id,
        })
    }

    /// Find all documents containing the phrase.
    pub fn find_phrase_matches(
        reader: &dyn LexicalIndexReader,
        field: &str,
        terms: &[String],
        slop: u32,
    ) -> Result<Vec<PhraseMatch>> {
        if terms.is_empty() {
            return Ok(Vec::new());
        }

        // Get posting iterators for all terms
        let mut iterators = Vec::new();
        for term in terms {
            match reader.postings(field, term)? {
                Some(iter) => iterators.push(iter),
                None => return Ok(Vec::new()), // If any term is missing, no phrase matches
            }
        }

        let mut phrase_matches = Vec::new();
        let mut doc_candidates = std::collections::HashMap::new();

        // Find documents that contain all terms
        for (term_idx, iter) in iterators.iter_mut().enumerate() {
            while iter.next()? {
                let doc_id = iter.doc_id();
                if doc_id == u64::MAX {
                    break;
                }

                let positions = iter.positions()?;
                doc_candidates
                    .entry(doc_id)
                    .or_insert_with(Vec::new)
                    .push((term_idx, positions));
            }
        }

        // Check each candidate document for valid phrase matches
        for (doc_id, term_positions) in doc_candidates {
            // Sort by term index to ensure correct order
            let mut term_positions = term_positions;
            term_positions.sort_by_key(|(term_idx, _)| *term_idx);

            // Skip if we don't have all terms
            if term_positions.len() != terms.len() {
                continue;
            }

            // Find valid phrase occurrences in this document
            let phrase_positions = Self::find_phrase_positions(&term_positions, slop);

            if !phrase_positions.is_empty() {
                phrase_matches.push(PhraseMatch {
                    doc_id,
                    phrase_freq: phrase_positions.len() as u32,
                    positions: phrase_positions,
                });
            }
        }

        // Sort matches by document ID
        phrase_matches.sort_by_key(|m| m.doc_id);
        Ok(phrase_matches)
    }

    /// Find valid phrase positions within a document.
    /// Returns the starting positions of valid phrases.
    fn find_phrase_positions(term_positions: &[(usize, Vec<u64>)], slop: u32) -> Vec<u64> {
        if term_positions.is_empty() {
            return Vec::new();
        }

        let mut valid_positions = Vec::new();

        // Get positions for the first term as starting points
        for &start_pos in &term_positions[0].1 {
            if Self::is_valid_phrase_at_position(term_positions, start_pos, slop) {
                valid_positions.push(start_pos);
            }
        }

        valid_positions
    }

    /// Check if there's a valid phrase starting at the given position.
    fn is_valid_phrase_at_position(
        term_positions: &[(usize, Vec<u64>)],
        start_pos: u64,
        slop: u32,
    ) -> bool {
        let mut expected_pos = start_pos;

        for (term_idx, positions) in term_positions {
            if *term_idx == 0 {
                // First term - must be at start_pos
                if !positions.contains(&start_pos) {
                    return false;
                }
            } else {
                // Subsequent terms - must be within slop distance
                expected_pos += 1; // Next expected position

                // Use binary search since positions are sorted.
                let idx = positions.partition_point(|&pos| pos < expected_pos);
                let found_pos = positions
                    .get(idx)
                    .copied()
                    .filter(|&pos| pos <= expected_pos + slop as u64);

                if let Some(actual_pos) = found_pos {
                    expected_pos = actual_pos;
                } else {
                    return false;
                }
            }
        }

        true
    }
}

impl Matcher for PhraseMatcher {
    fn doc_id(&self) -> u64 {
        // Report the sentinel once all matches have been consumed.
        if self.current_index < self.matches.len() {
            self.current_doc_id
        } else {
            u64::MAX
        }
    }

    fn next(&mut self) -> Result<bool> {
        if self.current_index >= self.matches.len() {
            return Ok(false);
        }

        self.current_index += 1;

        // Either land on the next match or transition to the exhausted state.
        match self.matches.get(self.current_index) {
            Some(m) => {
                self.current_doc_id = m.doc_id;
                Ok(true)
            }
            None => {
                self.current_doc_id = u64::MAX;
                Ok(false)
            }
        }
    }

    fn skip_to(&mut self, target: u64) -> Result<bool> {
        if self.matches.is_empty() {
            return Ok(false);
        }

        // Advance until the current match's doc ID reaches `target`.
        while self
            .matches
            .get(self.current_index)
            .map_or(false, |m| m.doc_id < target)
        {
            self.current_index += 1;
        }

        match self.matches.get(self.current_index) {
            Some(m) => {
                self.current_doc_id = m.doc_id;
                Ok(true)
            }
            None => {
                self.current_doc_id = u64::MAX;
                Ok(false)
            }
        }
    }

    fn is_exhausted(&self) -> bool {
        self.matches.len() <= self.current_index
    }

    fn cost(&self) -> u64 {
        // One unit of work per precomputed match.
        self.matches.len() as u64
    }
}

/// A scorer specialized for phrase queries.
#[derive(Debug, Clone)]
pub struct PhraseScorer {
    /// Document frequencies for phrase matches.
    phrase_doc_freq: HashMap<u64, u32>,
    /// Total number of documents.
    total_docs: u64,
    /// Average field length.
    avg_field_length: f64,
    /// Boost factor.
    boost: f32,
    /// BM25 parameters.
    k1: f32,
    b: f32,
}

impl PhraseScorer {
    /// Create a new phrase scorer from precomputed phrase matches.
    ///
    /// Uses standard BM25 parameters (k1 = 1.2, b = 0.75).
    pub fn new(
        phrase_matches: &[PhraseMatch],
        total_docs: u64,
        avg_field_length: f64,
        boost: f32,
    ) -> Self {
        // Map every matching document to its phrase occurrence count.
        let phrase_doc_freq: HashMap<u64, u32> = phrase_matches
            .iter()
            .map(|m| (m.doc_id, m.phrase_freq))
            .collect();

        PhraseScorer {
            phrase_doc_freq,
            total_docs,
            avg_field_length,
            boost,
            k1: 1.2,
            b: 0.75,
        }
    }

    /// Inverse document frequency for the phrase as a whole.
    ///
    /// Uses the standard BM25 IDF with an epsilon floor, multiplied by 1.2
    /// because phrase matches are more selective than single terms.
    fn phrase_idf(&self) -> f32 {
        let df = self.phrase_doc_freq.len() as f32;
        if df == 0.0 || self.total_docs == 0 {
            return 0.0;
        }

        let n = self.total_docs as f32;
        let epsilon = 0.1;

        // Guard against NaN/infinity from degenerate inputs.
        let base_idf = ((n - df + 0.5) / (df + 0.5)).ln();
        if !base_idf.is_finite() {
            return epsilon * 1.2;
        }

        (base_idf + epsilon).max(epsilon) * 1.2
    }

    /// BM25-style term-frequency component using the phrase occurrence count.
    fn phrase_tf(&self, phrase_freq: f32, field_length: f32) -> f32 {
        if phrase_freq == 0.0 {
            return 0.0;
        }

        // Clamp lengths so the normalization factor stays strictly positive.
        let avg_len = self.avg_field_length.max(1.0) as f32;
        let field_len = field_length.max(1.0);
        let norm_factor = (1.0 - self.b + self.b * (field_len / avg_len)).max(0.1);

        // Phrase occurrences count 1.5x as much as plain term occurrences.
        let boosted_freq = phrase_freq * 1.5;
        let tf = (boosted_freq * (self.k1 + 1.0)) / (boosted_freq + self.k1 * norm_factor);

        // Neutral fallback on numeric breakdown.
        if tf.is_finite() { tf } else { 1.0 }
    }
}

impl Scorer for PhraseScorer {
    fn score(&self, doc_id: u64, _term_freq: f32, _field_length: Option<f32>) -> f32 {
        // Phrase frequency replaces the ordinary term frequency; documents
        // without a phrase match score zero.
        let phrase_freq = match self.phrase_doc_freq.get(&doc_id) {
            Some(&freq) => freq as f32,
            None => return 0.0,
        };
        if phrase_freq == 0.0 {
            return 0.0;
        }

        let idf = self.phrase_idf();
        // Simplified: uses the corpus average instead of the per-document
        // field length.
        let field_length = self.avg_field_length as f32;
        let tf = self.phrase_tf(phrase_freq, field_length);

        let score = self.boost * idf * tf;

        // Guard against numeric breakdown with a boost-scaled default.
        if score.is_finite() { score } else { self.boost * 1.0 }
    }

    fn boost(&self) -> f32 {
        self.boost
    }

    fn set_boost(&mut self, boost: f32) {
        self.boost = boost;
    }

    fn max_score(&self) -> f32 {
        // Upper bound: IDF times the BM25 saturation limit (k1 + 1).
        if self.phrase_doc_freq.is_empty() {
            0.0
        } else {
            self.boost * self.phrase_idf() * (self.k1 + 1.0)
        }
    }

    fn name(&self) -> &'static str {
        "PhraseScorer"
    }
}

/// A query that matches documents containing an exact phrase.
///
/// A phrase query finds documents where the specified terms appear in order.
/// With the default slop of 0 the match is exact (no other terms between
/// them); a positive slop tolerates up to that many extra positions between
/// consecutive phrase terms.
#[derive(Debug, Clone)]
pub struct PhraseQuery {
    /// The field to search in.
    field: String,
    /// The terms that make up the phrase, in order.
    terms: Vec<String>,
    /// The boost factor for this query.
    boost: f32,
    /// Maximum allowed distance between terms (0 = exact phrase).
    slop: u32,
}

impl PhraseQuery {
    /// Create a new phrase query over `field` with the given ordered terms.
    ///
    /// Defaults: boost 1.0 and slop 0 (exact phrase match).
    pub fn new<S: Into<String>>(field: S, terms: Vec<String>) -> Self {
        PhraseQuery {
            field: field.into(),
            terms,
            boost: 1.0,
            slop: 0,
        }
    }

    /// Build a phrase query by splitting `phrase` on whitespace.
    pub fn from_phrase<S: Into<String>>(field: S, phrase: &str) -> Self {
        Self::new(
            field,
            phrase.split_whitespace().map(str::to_string).collect(),
        )
    }

    /// Set the boost factor for this query (builder style).
    pub fn with_boost(mut self, boost: f32) -> Self {
        self.boost = boost;
        self
    }

    /// Set the slop, i.e. the maximum distance allowed between terms.
    ///
    /// A slop of 0 requires an exact phrase match; a slop of 1 tolerates one
    /// intervening word between consecutive phrase terms.
    pub fn with_slop(mut self, slop: u32) -> Self {
        self.slop = slop;
        self
    }

    /// The field this query searches.
    pub fn field(&self) -> &str {
        &self.field
    }

    /// The ordered phrase terms.
    pub fn terms(&self) -> &[String] {
        &self.terms
    }

    /// The configured slop value.
    pub fn slop(&self) -> u32 {
        self.slop
    }
}

impl Query for PhraseQuery {
    fn matcher(&self, reader: &dyn LexicalIndexReader) -> Result<Box<dyn Matcher>> {
        // No terms -> nothing can match.
        if self.terms.is_empty() {
            return Ok(Box::new(EmptyMatcher::new()));
        }

        // Position-aware matcher that verifies term adjacency within slop.
        let matcher = PhraseMatcher::new(reader, &self.field, &self.terms, self.slop)?;
        Ok(Box::new(matcher))
    }

    fn scorer(&self, reader: &dyn LexicalIndexReader) -> Result<Box<dyn Scorer>> {
        // Degenerate case: no terms scores as a no-op BM25 scorer.
        if self.terms.is_empty() {
            return Ok(Box::new(BM25Scorer::new(0, 0, 0, 1.0, 1, self.boost)));
        }

        // Same fallback for an empty index.
        let total_docs = reader.doc_count();
        if total_docs == 0 {
            return Ok(Box::new(BM25Scorer::new(0, 0, 0, 1.0, 1, self.boost)));
        }

        // Compute the actual phrase matches so the scorer sees real
        // phrase frequencies.
        let phrase_matches =
            PhraseMatcher::find_phrase_matches(reader, &self.field, &self.terms, self.slop)?;

        // Average field length, falling back when statistics are unavailable.
        let avg_field_length = reader
            .field_statistics(&self.field)
            .map(|stats| stats.avg_field_length)
            .unwrap_or(10.0);

        // Longer phrases are generally more valuable: +20% boost per extra term.
        let phrase_boost = self.boost * (1.0 + 0.2 * (self.terms.len() as f32 - 1.0));

        Ok(Box::new(PhraseScorer::new(
            &phrase_matches,
            total_docs,
            avg_field_length,
            phrase_boost,
        )))
    }

    fn boost(&self) -> f32 {
        self.boost
    }

    fn set_boost(&mut self, boost: f32) {
        self.boost = boost;
    }

    fn description(&self) -> String {
        format!(
            "PhraseQuery(field:{}, terms:{:?}, slop:{})",
            self.field, self.terms, self.slop
        )
    }

    fn clone_box(&self) -> Box<dyn Query> {
        Box::new(self.clone())
    }

    fn is_empty(&self, _reader: &dyn LexicalIndexReader) -> Result<bool> {
        Ok(self.terms.is_empty())
    }

    fn cost(&self, _reader: &dyn LexicalIndexReader) -> Result<u64> {
        // Rough heuristic: proportional to the number of phrase terms.
        Ok(self.terms.len() as u64 * 100)
    }

    fn as_any(&self) -> &dyn std::any::Any {
        self
    }

    fn field(&self) -> Option<&str> {
        Some(&self.field)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_phrase_query_creation() {
        let query = PhraseQuery::new("content", vec!["hello".to_string(), "world".to_string()]);

        assert_eq!(query.field(), "content");
        assert_eq!(query.terms(), &["hello", "world"]);
        assert_eq!(query.slop(), 0);
        assert_eq!(query.boost(), 1.0);
    }

    #[test]
    fn test_phrase_query_from_phrase() {
        let query = PhraseQuery::from_phrase("content", "hello world test");

        assert_eq!(query.field(), "content");
        assert_eq!(query.terms(), &["hello", "world", "test"]);
    }

    #[test]
    fn test_phrase_query_from_phrase_extra_whitespace() {
        // split_whitespace must collapse runs of spaces and trim the ends.
        let query = PhraseQuery::from_phrase("content", "  hello   world ");

        assert_eq!(query.terms(), &["hello", "world"]);
    }

    #[test]
    fn test_phrase_query_from_empty_phrase() {
        // A whitespace-only phrase yields no terms at all.
        let query = PhraseQuery::from_phrase("content", "   ");

        assert!(query.terms().is_empty());
    }

    #[test]
    fn test_phrase_query_with_boost() {
        let query = PhraseQuery::new("content", vec!["hello".to_string()]).with_boost(2.5);

        assert_eq!(query.boost(), 2.5);
    }

    #[test]
    fn test_phrase_query_with_slop() {
        let query = PhraseQuery::new("content", vec!["hello".to_string(), "world".to_string()])
            .with_slop(2);

        assert_eq!(query.slop(), 2);
    }

    #[test]
    fn test_phrase_query_builder_chaining() {
        // Builder methods must compose without resetting each other.
        let query = PhraseQuery::from_phrase("content", "hello world")
            .with_boost(3.0)
            .with_slop(1);

        assert_eq!(query.boost(), 3.0);
        assert_eq!(query.slop(), 1);
        assert_eq!(query.terms(), &["hello", "world"]);
    }

    #[test]
    fn test_phrase_query_description() {
        let query = PhraseQuery::new("content", vec!["hi".to_string()]);

        assert_eq!(
            query.description(),
            "PhraseQuery(field:content, terms:[\"hi\"], slop:0)"
        );
    }
}