doctrine 0.4.1

Project tooling CLI
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
// SPDX-License-Identifier: GPL-3.0-only
//! Pure lexical leaf (SL-017, design §5). The pluggable lexical axis behind the
//! SL-008 9-key `sort_key`: a tokenizer, the `LexicalRanker` trait, today's
//! set-membership `OverlapRanker`, and the `f32 → u32` `quantize`. Imports
//! nothing of `retrieve` or `memory` (ADR-001 layering, design D6) — `retrieve`
//! adapts `Memory` into the leaf's `LexDoc`, never the reverse.
//!
//! PHASE-04 wired `Bm25Ranker` into `retrieve` as the hard default ranker, so the
//! staged `dead_code` bridge has self-cleared. `OverlapRanker` has no production
//! caller post-wiring (BM25 is the only runtime ranker, no selector — design D5),
//! so it lives behind `#[cfg(test)]` as the behaviour-preservation instrument that
//! proves the seam extraction did not alter overlap (design §5.4 retirement).

use bm25::{EmbedderBuilder, Scorer, Tokenizer};
use std::collections::{BTreeMap, BTreeSet};

/// Case-fold + split on non-alphanumeric, dropping empties. The shared lexer for
/// the lexical axis — `mem.foo.bar` / `src/x.rs` split on their separators too.
/// General-purpose: `retrieve` imports it; this leaf imports nothing of retrieval.
pub(crate) fn tokenize(s: &str) -> Vec<String> {
    s.split(|c: char| !c.is_alphanumeric())
        .filter(|t| !t.is_empty())
        .map(str::to_lowercase)
        .collect()
}

/// A memory-agnostic scoring document. `id` is the memory uid (caller-canonical,
/// assumed unique). `text` is the pre-concatenated doc bag (design §5.3).
pub(crate) struct LexDoc {
    pub(crate) id: String,
    pub(crate) text: String,
}

/// The fit-corpus view. One variant in SL-017; `Indexed(&LexicalIndex)` is the
/// non-breaking evolution when an active-set index is precomputed (follow-up, D7).
pub(crate) enum LexicalCorpus<'a> {
    Raw(&'a [LexDoc]),
}

impl LexicalCorpus<'_> {
    fn docs(&self) -> &[LexDoc] {
        match self {
            LexicalCorpus::Raw(docs) => docs,
        }
    }
}

pub(crate) trait LexicalRanker {
    /// Fit corpus-level statistics over `corpus`, score `targets`, and return one
    /// `(id, u32)` entry **per target, in `targets` order** (Key-2 magnitude).
    ///
    /// Contract (A1 — completeness is total, not best-effort):
    /// - The result has **exactly `targets.len()` entries, one per target, in
    ///   order.** A no-evidence target (query `None` / empty after tokenize / no
    ///   term matched) is filled with `0` *inside the ranker*. Candidate assembly
    ///   indexes positionally — it never uses `unwrap_or(0)`; an absent entry is a
    ///   bug.
    /// - **Hard precondition (all builds):** `assert!` every target id is present
    ///   in `corpus` (targets ⊆ corpus). A target outside the fit corpus is an
    ///   internal invariant violation (survivors ⊆ active = corpus) — fail loud,
    ///   never silently demote to 0.
    /// - Duplicate ids in `corpus` violate uniqueness (`debug_assert!`; release is
    ///   last-wins, undefined-but-safe).
    /// - Future (A4): an `Indexed` corpus may fail at *construction* time; `Raw`
    ///   never does.
    fn score(
        &self,
        query: Option<&str>,
        corpus: &LexicalCorpus<'_>,
        targets: &[&str],
    ) -> Vec<(String, u32)>;
}

/// Distinct-query-token set-membership over `LexDoc.text` (re-tokenized). Returns
/// the `u32` count directly — **no quantize** — byte-identical to the retired
/// SL-008 `lexical_score` (design §5.4). Corpus statistics are ignored: this is a
/// per-document signal. Test-only post-PHASE-04: BM25 is the runtime ranker (D5),
/// so this survives solely as the behaviour-preservation instrument.
#[cfg(test)]
pub(crate) struct OverlapRanker;

/// BM25 f32 (>= 0 under Lucene IDF) -> Key-2 u32 (design §5.2). Monotonic
/// non-decreasing, saturating, total (non-finite -> 0: invalid evidence, never
/// maximal).
pub(crate) const LEX_SCALE: f32 = 1_000_000.0;

pub(crate) fn quantize(score: f32) -> u32 {
    debug_assert!(score.is_finite(), "non-finite lexical score: scorer bug"); // A8: surface upstream bugs
    if !score.is_finite() {
        return 0;
    }
    let scaled = (score.max(0.0) * LEX_SCALE).round();
    // Rust's float→int `as` is *saturating* (since 1.45): out-of-range → u32::MAX,
    // and `scaled` is finite and >= 0 here, so this is total, monotonic, and the
    // saturation the design specifies — no separate ceiling branch needed. There
    // is no safe std float→int API; this guarded `as` carries the house-style
    // expect+reason (the only `as` in `src`, mem.pattern.lint.expect-not-allow).
    #[expect(
        clippy::as_conversions,
        clippy::cast_possible_truncation,
        clippy::cast_sign_loss,
        reason = "saturating float→u32 (Rust >= 1.45); scaled is finite and >= 0; no safe std API"
    )]
    let q = scaled as u32;
    q
}

/// Shared hard precondition (A1): every target must be present in the fit corpus,
/// and corpus ids must be unique. Used by every `LexicalRanker` impl.
fn assert_targets_subset(corpus: &LexicalCorpus<'_>, targets: &[&str]) {
    let docs = corpus.docs();
    debug_assert!(
        docs.iter()
            .map(|d| d.id.as_str())
            .collect::<BTreeSet<_>>()
            .len()
            == docs.len(),
        "duplicate corpus ids violate uniqueness"
    );
    let ids: BTreeSet<&str> = docs.iter().map(|d| d.id.as_str()).collect();
    for t in targets {
        assert!(
            ids.contains(t),
            "target id not in fit corpus (targets ⊆ corpus violated)"
        );
    }
}

#[cfg(test)]
impl LexicalRanker for OverlapRanker {
    fn score(
        &self,
        query: Option<&str>,
        corpus: &LexicalCorpus<'_>,
        targets: &[&str],
    ) -> Vec<(String, u32)> {
        assert_targets_subset(corpus, targets);
        let q_tokens: BTreeSet<String> = match query {
            Some(q) => tokenize(q).into_iter().collect(),
            None => BTreeSet::new(),
        };
        let by_id: BTreeMap<&str, &str> = corpus
            .docs()
            .iter()
            .map(|d| (d.id.as_str(), d.text.as_str()))
            .collect();
        targets
            .iter()
            .map(|t| {
                let hits = match (q_tokens.is_empty(), by_id.get(t)) {
                    (false, Some(text)) => {
                        let bag: BTreeSet<String> = tokenize(text).into_iter().collect();
                        q_tokens.iter().filter(|qt| bag.contains(*qt)).count()
                    }
                    _ => 0,
                };
                ((*t).to_string(), u32::try_from(hits).unwrap_or(u32::MAX))
            })
            .collect()
    }
}

/// The bm25 embedding space `D`. `DefaultTokenEmbedder`/`DefaultEmbeddingSpace`
/// are unexported, so we name a concrete hash space — `u32`, as the crate's own
/// tests do. The space is irrelevant to the BM25 maths (PHASE-01 probe).
type Space = u32;

/// The custom bm25 `Tokenizer` — a ZST adapter delegating to `lexical::tokenize`,
/// so there is ONE lexer across the lexical axis (design D2). No `DefaultTokenizer`
/// / stemming / stopwords; `--no-default-features` keeps that stack absent.
struct LexTokenizer;

impl Tokenizer for LexTokenizer {
    fn tokenize(&self, input_text: &str) -> Vec<String> {
        tokenize(input_text) // the module free fn, not this method (no receiver → no recursion)
    }
}

/// Corpus-relative Okapi BM25 behind the trait (design §5.4). Fits `avgdl`/IDF
/// over the FULL fit corpus via `with_tokenizer_and_fit_to_corpus` (the avgdl /
/// `doc_len` equivalence is STRUCTURAL — same tokenizer both sides, PHASE-01
/// Finding A), scores targets, `quantize`s each match, fills `0` for no-evidence
/// targets (A1). bm25's own descending order is discarded — doctrine's `sort_key`
/// re-orders downstream. Scores are non-negative (Lucene IDF), so `quantize`'s
/// `max(0)` is defensive only.
pub(crate) struct Bm25Ranker;

/// One `(id, 0)` per target, in order — the no-evidence / empty-input result (A1
/// totality without invoking bm25; also the avgdl div-by-zero guard, §5.4 step 1).
fn zeros(targets: &[&str]) -> Vec<(String, u32)> {
    targets.iter().map(|t| ((*t).to_string(), 0)).collect()
}

impl LexicalRanker for Bm25Ranker {
    fn score(
        &self,
        query: Option<&str>,
        corpus: &LexicalCorpus<'_>,
        targets: &[&str],
    ) -> Vec<(String, u32)> {
        assert_targets_subset(corpus, targets);
        let docs = corpus.docs();
        let q = match query {
            Some(q) if !tokenize(q).is_empty() && !docs.is_empty() => q,
            _ => return zeros(targets), // None / empty-after-tokenize / empty corpus
        };
        // Fit avgdl/IDF over the whole corpus (D3), then score the query. The
        // builder computes avgdl with the SAME tokenizer the embedder uses per
        // doc (structural A3); we consume id→score and discard bm25's ordering.
        let texts: Vec<&str> = docs.iter().map(|d| d.text.as_str()).collect();
        let embedder =
            EmbedderBuilder::<Space, _>::with_tokenizer_and_fit_to_corpus(LexTokenizer, &texts)
                .build();
        let mut scorer = Scorer::<String, Space>::new();
        for d in docs {
            scorer.upsert(&d.id, embedder.embed(d.text.as_str()));
        }
        let matched = scorer.matches(&embedder.embed(q));
        let scored: BTreeMap<&str, f32> =
            matched.iter().map(|m| (m.id.as_str(), m.score)).collect();
        targets
            .iter()
            .map(|t| {
                (
                    (*t).to_string(),
                    quantize(scored.get(t).copied().unwrap_or(0.0)),
                )
            })
            .collect()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    // -- tokenize (the shared lexical lexer, moved from retrieve) ------------

    #[test]
    fn tokenize_casefolds_and_splits_on_non_alphanumeric() {
        assert_eq!(tokenize("Auth Bug"), vec!["auth", "bug"]);
        // punctuation, underscore and slash all split; empties dropped
        assert_eq!(
            tokenize("src/memory.rs__OK"),
            vec!["src", "memory", "rs", "ok"]
        );
        // key segments split on the dot
        assert_eq!(tokenize("mem.auth.token"), vec!["mem", "auth", "token"]);
        assert!(tokenize("   ").is_empty());
    }

    // -- quantize (design §5.2 / §9, VT-3) ----------------------------------

    #[test]
    fn quantize_zero_is_zero() {
        assert_eq!(quantize(0.0), 0);
    }

    #[test]
    fn quantize_is_monotonic_non_decreasing() {
        let xs = [0.0_f32, 1e-6, 0.1, 1.0, 2.5, 30.0, 1000.0];
        for w in xs.windows(2) {
            assert!(
                quantize(w[0]) <= quantize(w[1]),
                "quantize not monotonic at {:?}",
                w
            );
        }
    }

    #[test]
    fn quantize_saturates() {
        assert_eq!(quantize(f32::MAX), u32::MAX);
        // anything scaling past the ceiling saturates, not truncates/wraps
        assert_eq!(quantize(1e30), u32::MAX);
    }

    // Non-finite is profile-split (design §5.2 clarification): a non-finite BM25
    // score is a scorer bug, not ordinary input — debug explodes (A8); release
    // stays total and degrades to 0 so invalid evidence is never maximal.
    #[test]
    #[cfg(debug_assertions)]
    #[should_panic(expected = "non-finite lexical score")]
    fn quantize_non_finite_panics_in_debug() {
        let _ = quantize(f32::NAN);
    }

    #[test]
    #[cfg(not(debug_assertions))]
    fn quantize_non_finite_is_zero_in_release() {
        assert_eq!(quantize(f32::NAN), 0);
        assert_eq!(quantize(f32::INFINITY), 0);
    }

    #[test]
    fn quantize_negative_is_zero() {
        // the defensive `max(0.0)` — a finite negative, so no debug_assert conflict
        assert_eq!(quantize(-1.0), 0);
    }

    // -- OverlapRanker (design §5.4, VT-2) ----------------------------------

    fn doc(id: &str, text: &str) -> LexDoc {
        LexDoc {
            id: id.to_string(),
            text: text.to_string(),
        }
    }

    #[test]
    fn overlap_counts_distinct_query_tokens_over_text() {
        let docs = vec![doc("a", "token expiry middleware check rust mem auth flow")];
        let corpus = LexicalCorpus::Raw(&docs);
        // 4 distinct hits
        assert_eq!(
            OverlapRanker.score(Some("token middleware rust auth"), &corpus, &["a"]),
            vec![("a".to_string(), 4)]
        );
        // a repeated query token counts once (SET semantics)
        assert_eq!(
            OverlapRanker.score(Some("token token token"), &corpus, &["a"]),
            vec![("a".to_string(), 1)]
        );
        // no overlap ⇒ 0
        assert_eq!(
            OverlapRanker.score(Some("python django"), &corpus, &["a"]),
            vec![("a".to_string(), 0)]
        );
    }

    #[test]
    fn overlap_no_query_is_all_zero() {
        let docs = vec![doc("a", "token"), doc("b", "auth")];
        let corpus = LexicalCorpus::Raw(&docs);
        assert_eq!(
            OverlapRanker.score(None, &corpus, &["a", "b"]),
            vec![("a".to_string(), 0), ("b".to_string(), 0)]
        );
        // empty / separators-only query tokenizes to nothing ⇒ 0
        assert_eq!(
            OverlapRanker.score(Some(""), &corpus, &["a"]),
            vec![("a".to_string(), 0)]
        );
        assert_eq!(
            OverlapRanker.score(Some("   ... "), &corpus, &["a"]),
            vec![("a".to_string(), 0)]
        );
    }

    #[test]
    fn overlap_is_positional_over_targets() {
        let docs = vec![doc("a", "token"), doc("b", "auth"), doc("c", "rust")];
        let corpus = LexicalCorpus::Raw(&docs);
        // result follows targets order, one entry each, corpus order ignored
        let got = OverlapRanker.score(Some("rust token"), &corpus, &["c", "a"]);
        assert_eq!(got, vec![("c".to_string(), 1), ("a".to_string(), 1)]);
        assert_eq!(got.len(), 2);
    }

    #[test]
    fn overlap_empty_targets_is_empty_vec() {
        let docs = vec![doc("a", "token")];
        let corpus = LexicalCorpus::Raw(&docs);
        assert!(OverlapRanker.score(Some("token"), &corpus, &[]).is_empty());
    }

    // -- contract invariants (A1, VT-4) -------------------------------------

    #[test]
    #[should_panic(expected = "targets ⊆ corpus")]
    fn target_outside_corpus_panics_in_all_builds() {
        let docs = vec![doc("a", "token")];
        let corpus = LexicalCorpus::Raw(&docs);
        let _ = OverlapRanker.score(Some("token"), &corpus, &["ghost"]);
    }

    #[test]
    #[should_panic(expected = "duplicate corpus ids")]
    fn duplicate_corpus_ids_trip_debug_assert() {
        let docs = vec![doc("a", "token"), doc("a", "other")];
        let corpus = LexicalCorpus::Raw(&docs);
        let _ = OverlapRanker.score(Some("token"), &corpus, &["a"]);
    }

    // -- Bm25Ranker (design §5.4, §9 test list; VT-1..VT-5) -----------------

    // VT-1 — IDF: a target on a RARE corpus term outranks one on a UBIQUITOUS
    // term, length held equal. ("common" df=3, "rare" df=1; both targets len 2.)
    #[test]
    fn bm25_idf_rare_outranks_common() {
        let docs = vec![
            doc("a", "common alpha"),
            doc("b", "rare beta"),
            doc("c", "common gamma"),
            doc("d", "common delta"),
        ];
        let corpus = LexicalCorpus::Raw(&docs);
        let got = Bm25Ranker.score(Some("common rare"), &corpus, &["a", "b"]);
        let (sa, sb) = (got[0].1, got[1].1);
        assert!(
            sb > sa,
            "rare-term target must outrank common-term: a={sa} b={sb}"
        );
    }

    // VT-2 — length normalisation (b-effect): equal TF, the SHORTER doc outranks
    // the longer. Both contain "term" once; avgdl=(1+5)/2=3, short<avgdl<long.
    #[test]
    fn bm25_length_norm_shorter_outranks_longer() {
        let docs = vec![
            doc("short", "term"),
            doc("long", "term alpha beta gamma delta"),
        ];
        let corpus = LexicalCorpus::Raw(&docs);
        let got = Bm25Ranker.score(Some("term"), &corpus, &["short", "long"]);
        assert!(
            got[0].1 > got[1].1,
            "shorter doc must outrank longer at equal TF: {got:?}"
        );
    }

    // VT-3 — determinism: permuted corpus/upsert order ⇒ identical per-target
    // scores (positional over targets, so corpus order cannot leak); and a
    // same-process repeat call is byte-identical (OQ-5/R7, by construction).
    #[test]
    fn bm25_is_shuffle_invariant_and_repeatable() {
        let d1 = vec![
            doc("a", "alpha beta"),
            doc("b", "beta gamma gamma"),
            doc("c", "alpha"),
        ];
        let d2 = vec![
            doc("c", "alpha"),
            doc("a", "alpha beta"),
            doc("b", "beta gamma gamma"),
        ];
        let c1 = LexicalCorpus::Raw(&d1);
        let c2 = LexicalCorpus::Raw(&d2);
        let targets = ["a", "b", "c"];
        let r1 = Bm25Ranker.score(Some("beta gamma"), &c1, &targets);
        let r2 = Bm25Ranker.score(Some("beta gamma"), &c2, &targets);
        assert_eq!(
            r1, r2,
            "permuted corpus order must yield identical per-target scores"
        );
        assert_eq!(r1, Bm25Ranker.score(Some("beta gamma"), &c1, &targets));
    }

    // VT-4 — edges: None / "" / separators-only / empty corpus ⇒ all-zero with
    // exactly targets.len() entries; a survivor no query term touches ⇒ 0; df is
    // taken over the FULL corpus, not the target subset (D3).
    #[test]
    fn bm25_edges_are_all_zero_with_exact_arity() {
        let docs = vec![doc("a", "token auth"), doc("b", "rust")];
        let corpus = LexicalCorpus::Raw(&docs);
        let z = vec![("a".to_string(), 0), ("b".to_string(), 0)];
        assert_eq!(Bm25Ranker.score(None, &corpus, &["a", "b"]), z);
        assert_eq!(Bm25Ranker.score(Some(""), &corpus, &["a", "b"]), z);
        assert_eq!(Bm25Ranker.score(Some("  ... "), &corpus, &["a", "b"]), z);
    }

    #[test]
    fn bm25_empty_corpus_returns_empty_vec() {
        let docs: Vec<LexDoc> = vec![];
        let corpus = LexicalCorpus::Raw(&docs);
        assert!(Bm25Ranker.score(Some("token"), &corpus, &[]).is_empty());
    }

    #[test]
    fn bm25_survivor_untouched_by_query_is_zero() {
        let docs = vec![doc("a", "token auth"), doc("b", "python django")];
        let corpus = LexicalCorpus::Raw(&docs);
        let got = Bm25Ranker.score(Some("token"), &corpus, &["a", "b"]);
        assert!(got[0].1 > 0, "matched target must be nonzero");
        assert_eq!(got[1].1, 0, "survivor no query term touches ⇒ 0");
    }

    // df over the full corpus: non-target "common" docs inflate df(common),
    // depressing its IDF below rare's — even though those docs are not targets.
    #[test]
    fn bm25_df_reflects_full_corpus_not_targets() {
        let docs = vec![
            doc("a", "common"),
            doc("b", "rare"),
            doc("x", "common"),
            doc("y", "common"),
            doc("z", "common"),
        ];
        let corpus = LexicalCorpus::Raw(&docs);
        let got = Bm25Ranker.score(Some("common rare"), &corpus, &["a", "b"]);
        assert!(
            got[1].1 > got[0].1,
            "df over full corpus must depress common: {got:?}"
        );
    }

    // VT-5 — avgdl equivalence on the REAL `lexical::tokenize` (carries PHASE-01
    // EX-3 to the production lexer): the builder's fitted avgdl equals the
    // independent multiset mean of token-output length over the corpus (A3).
    #[test]
    fn bm25_avgdl_equals_multiset_mean_on_real_tokenizer() {
        let texts = ["mem.auth.token", "src/x.rs ok", "single"];
        let reference_mean = {
            let total: usize = texts.iter().map(|t| tokenize(t).len()).sum();
            total as f32 / texts.len() as f32
        };
        let embedder =
            EmbedderBuilder::<Space, _>::with_tokenizer_and_fit_to_corpus(LexTokenizer, &texts)
                .build();
        assert_eq!(
            embedder.avgdl(),
            reference_mean,
            "self-computed avgdl must equal multiset mean over real tokenize (A3, VT-5)"
        );
    }
}