Skip to main content

difflore_core/context/retrieval/
rules.rs

1use crate::context::DEFAULT_TOP_K_RULES;
2use crate::context::ann;
3use crate::context::embedding::cosine_similarity;
4use crate::context::index_db::{self, IndexedRuleChunk, QueryFilter};
5use crate::domain::glob_match::{GlobErrorPolicy, glob_match};
6use crate::errors::CoreError;
7use crate::review_trajectory::{TrajectoryBuilder, TrajectoryStep};
8use sqlx::SqlitePool;
9use std::collections::{HashMap, HashSet};
10use std::time::Duration;
11
12use super::query_embed::embed_query_aligned_to_index;
13use super::scoring::{directive_intent_aligned, effective_confidence, infer_rule_kind};
14use super::{
15    ADAPTIVE_INJECT_THRESHOLD, EXPLICIT_RECALL_MIN_RELEVANCE, EXPLICIT_RECALL_RELATIVE_FLOOR,
16    INTENT_ALIGNMENT_EXEMPT_SCORE, MIN_RELEVANCE_SCORE, RELATIVE_RELEVANCE_FLOOR, RRF_K,
17    ScoredRuleChunk, concreteness_score, lexical_terms,
18};
19
/// Hard upper bound on the number of rules a single retrieval call may
/// return; the caller-requested `top_k` is clamped to this value.
const MAX_RULE_RETRIEVAL_TOP_K: usize = 50;
/// Upper bound on the number of nearest-neighbour candidates requested from
/// the ANN index for one query (the request is otherwise `3 * k`).
const MAX_ANN_CANDIDATES: usize = 150;
22
23/// Retrieve rules with confidence-weighted ranking.
24/// Final score = hybrid rank score with one final confidence tie-breaker.
25/// Rules with confidence < 0.2 are excluded (likely rejected).
26pub async fn retrieve_rules(
27    index_pool: &SqlitePool,
28    query: &str,
29    top_k: Option<usize>,
30) -> Result<Vec<ScoredRuleChunk>, CoreError> {
31    retrieve_rules_with_confidence(
32        index_pool,
33        query,
34        RetrievalOptions {
35            top_k,
36            ..Default::default()
37        },
38    )
39    .await
40}
41
42/// Decide whether a chunk's `file_patterns` (JSON-encoded glob list) match
43/// the given target file path. Returns `true` if patterns are absent / empty
44/// (universal rule), or if any glob matches. Malformed JSON or an unbuildable
45/// glob set are treated as a match — never silently drop a rule because of a
46/// parse error (over-recall is correct for retrieval). Iter-9 (2026-04-18)
47/// port of cloud `patternAllows`; B8 (shared `glob_match`).
48pub(super) fn pattern_allows(file_patterns_json: Option<&str>, target_file: &str) -> bool {
49    glob_match(file_patterns_json, target_file, GlobErrorPolicy::OverRecall)
50}
51
/// Options bundle consumed by `retrieve_rules_with_confidence`.
///
/// Retrieval options for confidence weighting, scoping, ANN usage, and
/// trajectory telemetry. The default path matches plain retrieval: default
/// top-k, no confidence map, no target-file cascade, no SQL metadata filter,
/// ANN enabled, and no trajectory capture.
pub struct RetrievalOptions<'a> {
    /// Maximum number of rules to return. `None` falls back to
    /// `DEFAULT_TOP_K_RULES`; `Some(0)` yields an empty result; larger values
    /// are clamped to the module's retrieval cap.
    pub top_k: Option<usize>,
    /// Per-skill confidence keyed by `skill_id`. When `None` (or a skill is
    /// absent from the map) the default confidence of 0.7 is used; rules
    /// whose confidence is below 0.2 are excluded from both ranking lanes.
    pub confidence_map: Option<&'a HashMap<String, f64>>,
    /// Optional allow-list applied before RRF fusion. Callers that already
    /// know the engine-eligible rule set can pass it here so disabled rules
    /// do not consume top-k score budget.
    pub eligible_skill_ids: Option<&'a HashSet<String>>,
    /// Iter-13 (2026-05-02). Per-skill age in days, used by the
    /// category-keyed half-life decay in `effective_confidence`. When
    /// `None` (or a chunk's `skill_id` is absent from the map) the
    /// scoring site uses `age_days = 0.0` — identical to the
    /// pre-plumbing behaviour, so no caller breaks if it doesn't pass
    /// a map.
    pub age_days_map: Option<&'a HashMap<String, f32>>,
    /// Path of the file being worked on. When set, only chunks whose
    /// `file_patterns` allow this path take part in retrieval.
    pub target_file: Option<&'a str>,
    /// Metadata pre-filter handed to the chunk query and the FTS search.
    /// `None` is treated as `QueryFilter::default()`.
    pub filter: Option<&'a QueryFilter>,
    /// When `false`, the ANN lookup is skipped and the embedding lane is
    /// ranked with the linear cosine scan instead.
    pub ann_enabled: bool,
    /// Optional provider-call budget for embedding the query. Latency-
    /// sensitive hook paths set this so a slow cloud embedder degrades to
    /// lexical retrieval instead of timing out the host agent's hook.
    pub embedding_timeout: Option<Duration>,
    /// When true, a query embed that falls back to lexical because the base
    /// budget timed out on a healthy cloud lane is retried once with a longer
    /// cold-absorbing budget (see [`COLD_RETRY_EMBEDDING_TIMEOUT`]). Only the
    /// human-waiting CLI `recall`/`search` path sets this; the latency-critical
    /// hook/MCP paths leave it `false` so a cold provider never blocks the agent.
    pub cold_start_retry: bool,
    /// When true, suppresses broad weak matches entirely. This is useful
    /// for unsolicited hook injection where "no extra context" is often
    /// better than five noisy rules. Explicit user/tool queries should
    /// leave this false so a search never looks broken just because the
    /// best match is weak.
    pub adaptive_prune: bool,
    /// Optional builder that receives the retrieval telemetry steps
    /// (metadata-filter counts, ANN recall, hybrid fusion). `None` disables
    /// telemetry capture.
    pub trajectory: Option<&'a mut TrajectoryBuilder>,
}
93
94impl Default for RetrievalOptions<'_> {
95    fn default() -> Self {
96        Self {
97            top_k: None,
98            confidence_map: None,
99            eligible_skill_ids: None,
100            age_days_map: None,
101            target_file: None,
102            filter: None,
103            ann_enabled: true,
104            embedding_timeout: None,
105            cold_start_retry: false,
106            adaptive_prune: false,
107            trajectory: None,
108        }
109    }
110}
111
112/// Retrieve rules with confidence weighting plus hybrid FTS / embedding retrieval.
113///
114/// `confidence_map` maps `skill_id` -> `confidence_score`. If None, all rules
115/// get default confidence 0.7.
116///
117/// `target_file`: when present, applies **strict cascade** — chunks whose
118/// `file_patterns` don't match the target file are dropped before scoring.
119/// When the matched bucket is empty, returns no pattern-scoped rules rather
120/// than widening into rules explicitly tagged for other files.
121///
122/// `filter`: metadata pre-filter applied at SQL time (C2). Empty filter
123/// means "no scoping" — retrieval sees every chunk.
124///
125/// `trajectory`: optional builder that captures RRF / filter statistics
126/// for the cloud dashboard. Passing `None` disables telemetry.
127pub async fn retrieve_rules_with_confidence(
128    index_pool: &SqlitePool,
129    query: &str,
130    options: RetrievalOptions<'_>,
131) -> Result<Vec<ScoredRuleChunk>, CoreError> {
132    let RetrievalOptions {
133        top_k,
134        confidence_map,
135        eligible_skill_ids,
136        age_days_map,
137        target_file,
138        filter,
139        ann_enabled,
140        embedding_timeout,
141        cold_start_retry,
142        adaptive_prune,
143        trajectory,
144    } = options;
145    let default_filter = QueryFilter::default();
146    let filter = filter.unwrap_or(&default_filter);
147    let requested_k = top_k.unwrap_or(DEFAULT_TOP_K_RULES);
148    if requested_k == 0 {
149        return Ok(Vec::new());
150    }
151    let k = requested_k.min(MAX_RULE_RETRIEVAL_TOP_K);
152    let retrieval_start = std::time::Instant::now();
153    let embedded_query =
154        embed_query_aligned_to_index(index_pool, query, embedding_timeout, cold_start_retry).await;
155    let query_emb = embedded_query.vector;
156
157    // Switch the RRF weighting when the actual query vector is only the
158    // local lexical hash, or when a provider failure disabled the vector
159    // lane entirely.
160    //
161    // 2026-05-03 A/B verified the hybrid (local hash + FTS5 BM25) lifts
162    // self-recall@5 from 45% (FTS-only) to 85%, and @1 from 10% to 45%.
163    // The local hash isn't semantic but its bag-of-token overlap fills
164    // FTS5's strict-tokenizer gap. Worth keeping until cloud-managed
165    // embedding is configured.
166    let is_semantic = embedded_query.semantic;
167
168    // ── C2: SQL-level metadata pre-filter ──────────────────────────
169    // When the filter is empty this reduces to `SELECT *`, matching the
170    // pre-C2 behaviour and so zero-cost for unscoped callers.
171    let unfiltered_count: u32 = if filter.is_empty() {
172        0
173    } else {
174        sqlx::query_scalar!(r#"SELECT COUNT(*) as "n!: i64" FROM rule_chunks"#)
175            .fetch_one(index_pool)
176            .await
177            .unwrap_or(0)
178            .try_into()
179            .unwrap_or(u32::MAX)
180    };
181    let chunks = index_db::query_rule_chunks(index_pool, filter).await?;
182    let after_count: u32 = u32::try_from(chunks.len()).unwrap_or(u32::MAX);
183
184    // ── C4: FTS5 keyword baseline ──────────────────────────────────
185    // Pull `k*4` raw hits so we have RRF material even after the
186    // pattern cascade trims some out.
187    let fts_limit = k.saturating_mul(4).min(200).max(k);
188    let fts_hits = index_db::fts_search(index_pool, query, filter, fts_limit)
189        .await
190        .unwrap_or_default();
191
192    let default_confidence = 0.7;
193    let min_confidence = 0.2;
194
195    // Pre-partition by file-pattern match if target_file is set. This is the
196    // strict cascade: when ANY chunk matches the target, drop the rest.
197    let matched: Vec<&IndexedRuleChunk> = if let Some(tf) = target_file {
198        chunks
199            .iter()
200            .filter(|c| pattern_allows(c.file_patterns.as_deref(), tf))
201            .collect()
202    } else {
203        chunks.iter().collect()
204    };
205    let active: &[&IndexedRuleChunk] = if target_file.is_some() && matched.is_empty() {
206        &[]
207    } else {
208        &matched
209    };
210
211    // Build a lookup table so FTS hits (identified by chunk id) can be
212    // reconciled against the cascade-filtered active set.
213    let id_to_chunk: HashMap<&str, &IndexedRuleChunk> =
214        active.iter().map(|c| (c.id.as_str(), *c)).collect();
215
216    // ── Embedding-ranked candidate list ───────────────────────────
217    //
218    // Try the HNSW ANN path first. It returns a small candidate set that is
219    // intersected with the metadata-filtered `active` set. On any failure, fall
220    // back to the linear cosine scan.
221    let ann_candidates = k.saturating_mul(3).min(MAX_ANN_CANDIDATES).max(k);
222    let ann_result = if ann_enabled {
223        try_ann_rank(
224            &query_emb,
225            ann_candidates,
226            &id_to_chunk,
227            confidence_map,
228            eligible_skill_ids,
229            default_confidence,
230            min_confidence,
231        )
232        .await
233    } else {
234        None
235    };
236
237    let (mut emb_ranked, ann_used, ann_index_size, ann_returned): (
238        Vec<(&IndexedRuleChunk, f64)>,
239        bool,
240        u32,
241        u32,
242    ) = if let Some((ranked, idx_size, returned)) = ann_result {
243        (ranked, true, idx_size, returned)
244    } else {
245        let fallback: Vec<(&IndexedRuleChunk, f64)> = active
246            .iter()
247            .filter_map(|c: &&IndexedRuleChunk| {
248                if !eligible_skill_ids.is_none_or(|ids| ids.contains(&c.skill_id)) {
249                    return None;
250                }
251                let confidence = confidence_map
252                    .and_then(|m| m.get(&c.skill_id).copied())
253                    .unwrap_or(default_confidence);
254                if confidence < min_confidence {
255                    return None;
256                }
257                if query_emb.len() != c.embedding.len() {
258                    return None;
259                }
260                let sim = cosine_similarity(&query_emb, &c.embedding);
261                Some((*c, f64::from(sim)))
262            })
263            .collect();
264        (fallback, false, 0, 0)
265    };
266    emb_ranked.sort_by(|a, b| b.1.total_cmp(&a.1).then_with(|| a.0.id.cmp(&b.0.id)));
267
268    let emb_rank_map: HashMap<&str, usize> = emb_ranked
269        .iter()
270        .enumerate()
271        .map(|(i, (c, _))| (c.id.as_str(), i))
272        .collect();
273
274    // ── FTS rank map (only keeps hits that survived the cascade). ──
275    let mut fts_rank_map: HashMap<&str, usize> = HashMap::new();
276    let mut fts_kept = 0u32;
277    for (i, (id, _)) in fts_hits.iter().enumerate() {
278        if id_to_chunk.contains_key(id.as_str()) {
279            fts_rank_map.insert(id.as_str(), i);
280            fts_kept += 1;
281        }
282    }
283
284    // Overlap metric for telemetry — how many ids were ranked by BOTH
285    // paths. High overlap → paths agree; low overlap → they're
286    // surfacing complementary results (the whole point of hybrid).
287    let overlap: u32 = {
288        let fts_ids: HashSet<&str> = fts_rank_map.keys().copied().collect();
289        let emb_ids: HashSet<&str> = emb_rank_map.keys().copied().collect();
290        u32::try_from(fts_ids.intersection(&emb_ids).count()).unwrap_or(u32::MAX)
291    };
292
293    // ── RRF fusion ────────────────────────────────────────────────
294    //
295    //    score(chunk) = w_emb * 1/(k+rank_emb) + w_fts * 1/(k+rank_fts)
296    //
297    // When the embedder is not semantic, skew toward the FTS baseline because
298    // local SHA1 is noise-dominated.
299    let (w_emb, w_fts) = if is_semantic { (0.5, 0.5) } else { (0.2, 0.8) };
300
301    let mut fused: HashMap<&str, (f64, &IndexedRuleChunk, f64 /*confidence*/)> = HashMap::new();
302    // `_sim` is unused directly in RRF (ranks already encode it). The
303    // raw score is kept in the vector only to sort embedding candidates
304    // before assigning reciprocal ranks.
305    for (chunk, _sim) in &emb_ranked {
306        let rank = emb_rank_map.get(chunk.id.as_str()).copied().unwrap_or(0);
307        let contrib = w_emb / (RRF_K + rank as f64 + 1.0);
308        let confidence = confidence_map
309            .and_then(|m| m.get(&chunk.skill_id).copied())
310            .unwrap_or(default_confidence);
311        fused
312            .entry(chunk.id.as_str())
313            .and_modify(|e| e.0 += contrib)
314            .or_insert((contrib, *chunk, confidence));
315    }
316    for (id, rank) in &fts_rank_map {
317        if let Some(chunk) = id_to_chunk.get(id) {
318            if !eligible_skill_ids.is_none_or(|ids| ids.contains(&chunk.skill_id)) {
319                continue;
320            }
321            let contrib = w_fts / (RRF_K + *rank as f64 + 1.0);
322            let confidence = confidence_map
323                .and_then(|m| m.get(&chunk.skill_id).copied())
324                .unwrap_or(default_confidence);
325            if confidence < min_confidence {
326                continue;
327            }
328            fused
329                .entry(id)
330                .and_modify(|e| e.0 += contrib)
331                .or_insert((contrib, *chunk, confidence));
332        }
333    }
334
335    // ── Emit trajectory telemetry (best-effort, never blocks recall) ──
336    if let Some(t) = trajectory {
337        if !filter.is_empty() {
338            t.push(TrajectoryStep::RetrievalFilter {
339                before: unfiltered_count,
340                after: after_count,
341            });
342        }
343        t.push(TrajectoryStep::AnnRecall {
344            used: ann_used,
345            index_size: ann_index_size,
346            candidates: ann_returned,
347        });
348        t.push(TrajectoryStep::HybridFusion {
349            fts_hits: fts_kept,
350            emb_hits: u32::try_from(emb_ranked.len()).unwrap_or(u32::MAX),
351            overlap,
352        });
353    }
354
355    // Materialise the final scored list. `ScoredRuleChunk.score` is
356    // the fused RRF score multiplied by a *small* confidence multiplier
357    // — confidence acts as a tie-breaker rather than the primary
358    // ranking signal. The earlier `sqrt(confidence)` weight flipped
359    // the ordering on real workloads: a freshly captured (conf=0.6)
360    // conversation rule with a strong file-pattern + lexical match was
361    // demoted below cloud-extracted rules (conf=0.7) whose query
362    // overlap was 5-20% lower. Net result: the rule the user just
363    // taught DiffLore was the LAST rule injected for the very file it
364    // applies to. That breaks the slogan ("AI understands your preferences better and better") at
365    // exactly the moment users will check whether the slogan is true.
366    //
367    // The 0.9 + 0.1 * confidence multiplier keeps spread at 8%
368    // (conf=0.2 floor → 0.92; conf=1.0 → 1.0). RRF score gaps between
369    // adjacent ranks in our regime are 5-20%, so confidence can break
370    // a near-tie but cannot overturn a clear lexical/semantic winner.
371    // Strengthening (+0.05 confidence per accept) still earns +0.5%
372    // multiplier — enough to win against an equally-relevant peer at
373    // a lower confidence, which is what "the rule I've ratified twice
374    // outranks the rule captured once" should feel like.
375    let mut scored: Vec<ScoredRuleChunk> = fused
376        .into_values()
377        .map(|(score, chunk, confidence)| {
378            // Confidence tie-breaker (8% spread) + content-concreteness
379            // boost. Iter-12 (2026-04-25) added the concreteness factor
380            // because rule-impact-by-kind audit showed slogan rules
381            // ("Trust CI for workflow correctness", "Hold clean PRs for
382            // additional review") were misfiring across languages —
383            // they have no concrete code tokens to anchor relevance to.
384            // The concreteness signal counts backticked tokens + path-
385            // like fragments + version literals in the rule's content,
386            // saturated at 6 hits to avoid runaway when a rule body is
387            // mostly code. Net: a singleton rule citing
388            // `useQuery({...})` outranks a generic slogan with the
389            // same lexical match, fixing the Python −0.38 over-engineer
390            // regime we measured in iter 9.6.
391            // Iter-13 (2026-05-02). Borrow jcode's category-keyed half-life
392            // so an ancient style rule no longer outranks a freshly ratified
393            // correction on conf alone. Kind is inferred from chunk content
394            // (no `kind` column on rule_chunks); age_days comes from the
395            // optional per-call `age_days_map` (None ⇒ 0.0 ⇒ no decay,
396            // matching the original behaviour for callers that haven't
397            // wired the map yet).
398            let kind = infer_rule_kind(&chunk.content);
399            let age_days = age_days_map
400                .and_then(|m| m.get(&chunk.skill_id).copied())
401                .unwrap_or(0.0);
402            let eff_conf = f64::from(effective_confidence(confidence as f32, &kind, age_days));
403            let conf_weight = 0.1f64.mul_add(eff_conf.clamp(0.0, 1.0), 0.9);
404            let conc = concreteness_score(&chunk.content);
405            // Each concreteness "point" adds 5% to score, capped at +30%.
406            let conc_weight = 0.05f64.mul_add(conc.min(6) as f64, 1.0);
407            ScoredRuleChunk {
408                skill_id: chunk.skill_id.clone(),
409                content: chunk.content.clone(),
410                score: score * conf_weight * conc_weight,
411                confidence,
412            }
413        })
414        .collect();
415    scored.sort_by(|a, b| {
416        b.score
417            .total_cmp(&a.score)
418            .then_with(|| a.skill_id.cmp(&b.skill_id))
419    });
420
421    // Adaptive top-K + noise floor.
422    //
423    // Iter-12 hardens the "less is more" principle on top of iter-4's
424    // floors. The fastapi/Python regression (-0.38 ΔB-A in iter 9.6)
425    // traced to the agent receiving 5 weak rules on simple tasks (typo
426    // fix, parameter substitution) where claude's training already
427    // nailed the answer. Five weak rules induced over-engineering. The
428    // fix: when the top result's score itself is in the noise band,
429    // emit ZERO rules — let the agent trust its training.
430    //
431    // Adaptive zero-inject is **only safe for unsolicited
432    // injection** (PreToolUse:Read hook). Explicit user queries via
433    // Explicit canonical MCP rule-search calls must always
434    // return what's available — when a user types `search_rules
435    // intent=...`, returning empty would feel broken even if scores
436    // are weak. Callers opt in by setting `top_k=Some(5)` AND wanting
437    // adaptive behaviour explicitly via the iter-12 hook contract.
438    //
439    // The rule of thumb: if only the absolute floor would have kept
440    // ≥3 results in scope (i.e. there's a real "noise tail" worth
441    // pruning), apply adaptive. Tiny corpora with 1-2 candidates
442    // bypass adaptive — those results are fine to return as-is.
443    // Adaptive zero-inject only when we'd otherwise return many weak
444    // matches (the "5 weak rules" pathology). Small corpora and small
445    // result sets bypass — those are explicit user queries with
446    // limited candidates anyway.
447    let adaptive_eligible = adaptive_prune && scored.len() >= 5;
448    if let Some(top_score) = scored.first().map(|s| s.score) {
449        if adaptive_eligible && top_score < ADAPTIVE_INJECT_THRESHOLD {
450            // Top match is itself weak AND we have many results — this
451            // is the "5 weak rules" pathology. Return empty.
452            scored.clear();
453        } else {
454            prune_below_floors(&mut scored, top_score);
455
456            // Adaptive K: when many results cluster within 60% of the
457            // top, agent can't tell signal from noise — return just
458            // the clearly-strong ones. Skip when result set is tiny
459            // (already informative).
460            if adaptive_eligible {
461                let strong_floor = top_score * 0.60;
462                let strong_count = scored
463                    .iter()
464                    .take_while(|s| s.score >= strong_floor)
465                    .count();
466                if strong_count > 0 && strong_count < scored.len() {
467                    scored.truncate(strong_count.min(k));
468                }
469            }
470        }
471    }
472
473    scored.truncate(k);
474
475    // Memory-pipeline event: surfaces the ANN/embedding pass to the TUI
476    // Activity tab so users can see retrieval running. Best-effort —
477    // never blocks recall.
478    crate::activity_stream::record(
479        crate::activity_stream::ActivityPayload::RetrievalEmbedding {
480            hits: u32::try_from(scored.len()).unwrap_or(u32::MAX),
481            took_ms: u64::try_from(retrieval_start.elapsed().as_millis()).unwrap_or(u64::MAX),
482        },
483    );
484
485    Ok(scored)
486}
487
488/// Drop the RRF noise tail from an already-sorted (descending) scored
489/// list using the two floors that have always guarded retrieval: the
490/// absolute [`MIN_RELEVANCE_SCORE`] (RRF rounding noise / cascade-only
491/// admits) AND the relative [`RELATIVE_RELEVANCE_FLOOR`] fraction of the
492/// top hit (the "everything scored 0.02" flat-distribution pathology).
493///
494/// Factored out of `retrieve_rules_with_confidence` so the same retain
495/// is shared with the explicit-recall gate below and is unit-testable in
496/// isolation. Pure: mutates `scored` in place, never re-sorts (the caller
497/// has already sorted), so `top_score` must be the current leader's score.
498fn prune_below_floors(scored: &mut Vec<ScoredRuleChunk>, top_score: f64) {
499    let relative_floor = top_score * RELATIVE_RELEVANCE_FLOOR;
500    scored.retain(|s| s.score > MIN_RELEVANCE_SCORE && s.score >= relative_floor);
501}
502
503/// Adaptive relevance gate for the EXPLICIT recall surfaces — the MCP
504/// `search_rules` tool and the CLI `recall` command. Mirrors the hook
505/// path's adaptive pruning so an agent never has to weigh five weak rules
506/// against an empty answer: irrelevant memory is worse than none.
507///
508/// The hook path (`adaptive_prune == true` inside
509/// `retrieve_rules_with_confidence`) zero-injects on a weak top hit and
510/// drops the noise tail *before* any downstream reranking. The explicit
511/// paths can't do that in-retrieval because they still add high-value
512/// signals after fusion — exact-title-strict matches (score `2.0 + conf`),
513/// the cross-repo starter set, and the lexical-intent re-rank boost — so
514/// this gate runs on the FINAL, fully-reranked, sorted list instead. The
515/// net contract is the same as the hook's: a low-relevance query
516/// (wrong-file, no intent overlap — e.g. a Codecov rule surfacing in a
517/// wrong-file top-3) collapses to ZERO results so the caller emits its
518/// existing "no relevant memory" message rather than confident filler.
519///
520/// Two conservative gates, tuned so genuinely-strong matches are NEVER
521/// suppressed:
522///   1. Absolute floor — if even the top hit is below
523///      [`EXPLICIT_RECALL_MIN_RELEVANCE`], every result is noise: clear.
524///      After the lexical-intent re-rank a genuinely relevant top hit
525///      sits far above this floor (boosted into the 0.1+ range), while a
526///      cascade-only / no-overlap top hit stays in the raw RRF band
527///      (~0.001–0.005) and is correctly dropped.
528///   2. Relative floor — drop tail results below
529///      [`EXPLICIT_RECALL_RELATIVE_FLOOR`] of the (surviving) top hit, so
530///      a strong leader doesn't drag along far-weaker filler. Deliberately
531///      looser than the hook's [`RELATIVE_RELEVANCE_FLOOR`]: explicit
532///      queries should keep more of a real result set, only shedding the
533///      clearly-irrelevant tail.
534///
535/// Pure and in-place. The caller must pass a list already sorted
536/// descending by `score` (both explicit call sites do, via their final
537/// re-rank). Strong matches (including exact-title-strict and starter
538/// hits) clear both floors by a wide margin, so this never regresses a
539/// real recall.
540pub fn apply_explicit_recall_threshold(scored: &mut Vec<ScoredRuleChunk>) {
541    let Some(top_score) = scored.first().map(|s| s.score) else {
542        return;
543    };
544    // Absolute floor: the best match itself is noise → return nothing.
545    if top_score < EXPLICIT_RECALL_MIN_RELEVANCE {
546        scored.clear();
547        return;
548    }
549    // Relative floor: shed the tail far below the leader.
550    let relative_floor = top_score * EXPLICIT_RECALL_RELATIVE_FLOOR;
551    scored.retain(|s| s.score >= relative_floor);
552}
553
554/// Intent-alignment gate for the EXPLICIT recall surfaces — applied BEFORE
555/// [`apply_explicit_recall_threshold`] on the final, fully-reranked list.
556///
557/// WHY: topically adjacent rules can clear relevance floors while addressing a
558/// different action or subject than the directive. This gate adds the missing
559/// axis: does the rule's directive match the query intent, not just its topic?
560///
561/// Behaviour, biased hard toward FEWER / zero (DiffLore's "stay silent
562/// unless it clearly applies" positioning):
563///   * An all-weak query (no salient terms after stop-word filtering) cannot
564///     establish intent for ANY rule → clear. Returning nothing is correct
565///     here: we have no signal to claim a match.
566///   * A candidate is KEPT when it is either strongly scored (≥
567///     [`INTENT_ALIGNMENT_EXEMPT_SCORE`] — exact-title-strict / starter /
568///     strongly lexically-boosted hits, already intent-validated upstream)
569///     or its directive is intent-aligned per [`directive_intent_aligned`].
570///   * Every other candidate — the topically-adjacent middle band — is
571///     dropped.
572///
573/// Conservative by construction: the strong-score exemption guarantees no
574/// genuinely-strong match (and therefore no eval self-recall hit, where the
575/// query is the rule's own intent text and overlap is near-total) is ever
576/// suppressed. Pure / in-place; order is preserved (the caller has already
577/// sorted, and this only `retain`s).
578pub fn apply_intent_alignment_gate(scored: &mut Vec<ScoredRuleChunk>, intent: &str) {
579    if scored.is_empty() {
580        return;
581    }
582    let query_terms = lexical_terms(intent);
583    if query_terms.is_empty() {
584        // No salient intent to align against — per the "fewer / zero"
585        // bias, an unscorable intent yields no confident matches.
586        scored.clear();
587        return;
588    }
589    scored.retain(|chunk| {
590        chunk.score >= INTENT_ALIGNMENT_EXEMPT_SCORE
591            || directive_intent_aligned(&chunk.content, &query_terms)
592    });
593}
594
595/// Attempt the HNSW ANN ranking path for the current project.
596///
597/// Returns `Some((ranked, index_size, returned))` on a successful ANN
598/// lookup that produced at least one candidate inside the
599/// metadata-filtered `active` set. Returns `None` on any of:
600/// - empty / missing on-disk index
601/// - dim mismatch between query and stored vectors
602/// - ANN search yielded zero usable candidates (e.g. all hits were
603///   tombstoned or outside the active filter)
604/// - any internal error talking to the ANN cache
605///
606/// The caller MUST treat `None` as "use the linear cosine scan". This
607/// is the safety net that guarantees retrieval keeps working when the
608/// HNSW index is absent or stale.
609async fn try_ann_rank<'a>(
610    query_emb: &[f32],
611    candidates: usize,
612    id_to_chunk: &HashMap<&'a str, &'a IndexedRuleChunk>,
613    confidence_map: Option<&HashMap<String, f64>>,
614    eligible_skill_ids: Option<&HashSet<String>>,
615    default_confidence: f64,
616    min_confidence: f64,
617) -> Option<(Vec<(&'a IndexedRuleChunk, f64)>, u32, u32)> {
618    if query_emb.is_empty() || candidates == 0 {
619        return None;
620    }
621    // Resolve the project hash from the current working directory. The
622    // ANN cache is keyed on this hash so MCP calls running in the same
623    // project share one graph across calls. Retrieval call sites that
624    // run outside a project root (unit tests in a tempdir) will still
625    // get a valid hash — they just won't have a persisted graph to
626    // reload, which is fine: `load_or_empty` returns an empty index and
627    // we fall through to the linear scan.
628    let project_root = crate::db::current_project_root();
629    let project_hash = crate::db::project_hash_from_root(&project_root);
630
631    let ann_arc = ann::get_ann_for_project(&project_hash, query_emb.len())
632        .await
633        .ok()?;
634    let ann_guard = ann_arc.lock().await;
635    let index_size = ann_guard.live_size();
636    if index_size == 0 {
637        return None;
638    }
639    let hits = ann_guard.search(query_emb, candidates);
640    if hits.is_empty() {
641        return None;
642    }
643    let returned = u32::try_from(hits.len()).unwrap_or(u32::MAX);
644
645    // Translate the ANN hit set back into `&IndexedRuleChunk` + RRF
646    // score. The score we carry is raw cosine similarity so confidence
647    // is applied at exactly one ranking site (the final tie-breaker).
648    // DistCosine returns `1 - cos`, so cosine similarity is `1 - distance`.
649    let mut ranked: Vec<(&IndexedRuleChunk, f64)> = Vec::with_capacity(hits.len());
650    for (chunk_id, distance) in hits {
651        let Some(chunk) = id_to_chunk.get(chunk_id.as_str()) else {
652            // Hit lives in the graph but didn't survive the metadata
653            // pre-filter — drop it.
654            continue;
655        };
656        if !eligible_skill_ids.is_none_or(|ids| ids.contains(&chunk.skill_id)) {
657            continue;
658        }
659        let confidence = confidence_map
660            .and_then(|m| m.get(&chunk.skill_id).copied())
661            .unwrap_or(default_confidence);
662        if confidence < min_confidence {
663            continue;
664        }
665        let sim = (1.0 - f64::from(distance)).max(0.0);
666        ranked.push((*chunk, sim));
667    }
668    if ranked.is_empty() {
669        // ANN surfaced hits but none survived the filter — treat as a
670        // miss so the linear scan can try to find something.
671        return None;
672    }
673    Some((ranked, index_size, returned))
674}
675
676#[cfg(test)]
677mod tests {
678    use super::super::MIN_INTENT_DIRECTIVE_OVERLAP;
679    use super::*;
680
681    fn chunk(id: &str, score: f64) -> ScoredRuleChunk {
682        ScoredRuleChunk {
683            skill_id: id.to_owned(),
684            content: format!("Rule ID: {id}\nRule Name: {id}\n\nbody"),
685            score,
686            confidence: 0.7,
687        }
688    }
689
690    #[test]
691    fn explicit_recall_threshold_strong_top_hit_survives() {
692        // A genuinely strong match (lexically boosted into the 0.1+ band)
693        // must always survive — the gate is conservative and never
694        // suppresses real recall.
695        let mut scored = vec![chunk("strong", 0.30), chunk("supporting", 0.12)];
696        apply_explicit_recall_threshold(&mut scored);
697        assert_eq!(scored.len(), 2, "strong matches must not be pruned");
698        assert_eq!(scored[0].skill_id, "strong");
699    }
700
701    #[test]
702    fn explicit_recall_threshold_all_weak_returns_empty() {
703        // Wrong-file / low-relevance query: even the top hit is in the raw
704        // fused RRF noise band, so the whole set is filler and should return
705        // zero results.
706        let mut scored = vec![
707            chunk("noise-1", 0.004),
708            chunk("noise-2", 0.003),
709            chunk("noise-3", 0.002),
710            chunk("noise-4", 0.0015),
711            chunk("noise-5", 0.001),
712        ];
713        apply_explicit_recall_threshold(&mut scored);
714        assert!(
715            scored.is_empty(),
716            "a query whose only matches are weak must return zero results"
717        );
718    }
719
720    #[test]
721    fn explicit_recall_threshold_borderline_keeps_only_strong() {
722        // Borderline set: one clear leader well above the absolute floor,
723        // plus tail rules far below it. The leader (and anything within the
724        // relative band) survives; the far-below-leader tail is dropped.
725        let mut scored = vec![
726            chunk("leader", 0.40),
727            chunk("near", 0.10), // 25% of leader — within the 0.20 relative floor
728            chunk("tail-1", 0.05), // 12.5% of leader — dropped
729            chunk("tail-2", 0.02),
730            chunk("tail-3", 0.011),
731        ];
732        apply_explicit_recall_threshold(&mut scored);
733        let ids: Vec<&str> = scored.iter().map(|s| s.skill_id.as_str()).collect();
734        assert_eq!(
735            ids,
736            vec!["leader", "near"],
737            "only the leader and rules within the relative band survive"
738        );
739    }
740
741    #[test]
742    fn explicit_recall_threshold_top_hit_at_absolute_floor_is_kept() {
743        // A top hit exactly at the absolute floor is NOT below it, so it
744        // survives — proving the gate suppresses only genuine sub-floor
745        // noise, never a borderline-but-present match.
746        let mut scored = vec![chunk("at-floor", EXPLICIT_RECALL_MIN_RELEVANCE)];
747        apply_explicit_recall_threshold(&mut scored);
748        assert_eq!(scored.len(), 1, "top hit at the floor must be kept");
749    }
750
751    #[test]
752    fn explicit_recall_threshold_empty_input_is_noop() {
753        let mut scored: Vec<ScoredRuleChunk> = Vec::new();
754        apply_explicit_recall_threshold(&mut scored);
755        assert!(scored.is_empty());
756    }
757
758    // -- Intent-alignment gate tests (precision fix) --
759
760    /// Build a candidate whose distilled directive is its `Rule Name:` title.
761    /// `score` is left in the moderate (gated) band by default so the gate's
762    /// alignment check — not the strong-score exemption — decides its fate.
763    fn directive_chunk(id: &str, title: &str, score: f64) -> ScoredRuleChunk {
764        ScoredRuleChunk {
765            skill_id: id.to_owned(),
766            content: format!(
767                "Rule ID: {id}\nRule Name: {title}\nType: convention\nTags: \n\n{title}."
768            ),
769            score,
770            confidence: 0.7,
771        }
772    }
773
774    #[test]
775    fn intent_gate_drops_topically_adjacent_different_subject_rule() {
776        // The diagnosed failure: a "return false vs panic" directive recalls a
777        // panic-MESSAGE-wording rule and a test-timing rule. Both share the
778        // file area / topical anchor ("panic"/"test") but address a DIFFERENT
779        // action+subject than the query, so the agent gets distracted. Each is
780        // dropped because its directive shares <2 of the query's salient terms
781        // (and <half of them).
782        let mut scored = vec![
783            directive_chunk(
784                "panic-message-wording",
785                "Panic messages should describe the violated invariant",
786                0.12,
787            ),
788            directive_chunk(
789                "test-timing",
790                "Avoid sleep-based waits in tests; poll for the condition",
791                0.10,
792            ),
793        ];
794        apply_intent_alignment_gate(
795            &mut scored,
796            "return false instead of panic on invalid input",
797        );
798        assert!(
799            scored.is_empty(),
800            "topically-adjacent, wrong-subject rules must be dropped, kept: {:?}",
801            scored.iter().map(|s| &s.skill_id).collect::<Vec<_>>()
802        );
803    }
804
805    #[test]
806    fn intent_gate_keeps_directly_on_subject_rule() {
807        // The on-subject rule shares the action verb AND its object
808        // (return + false + panic + input), clearing the absolute-overlap
809        // bar, so it survives even at a moderate (non-exempt) score.
810        let mut scored = vec![directive_chunk(
811            "return-false-not-panic",
812            "Return false rather than panic on invalid input",
813            0.12,
814        )];
815        apply_intent_alignment_gate(
816            &mut scored,
817            "return false instead of panic on invalid input",
818        );
819        assert_eq!(
820            scored
821                .iter()
822                .map(|s| s.skill_id.as_str())
823                .collect::<Vec<_>>(),
824            vec!["return-false-not-panic"],
825            "a directly-on-subject directive must survive the intent gate"
826        );
827    }
828
829    #[test]
830    fn intent_gate_keeps_on_subject_drops_adjacent_in_same_set() {
831        // The realistic mixed set the A/B saw: the on-subject rule plus the two
832        // topically-adjacent distractors, all admitted by hybrid retrieval.
833        // The gate keeps only the aligned one.
834        let mut scored = vec![
835            directive_chunk(
836                "return-false-not-panic",
837                "Return false rather than panic on invalid input",
838                0.12,
839            ),
840            directive_chunk(
841                "panic-message-wording",
842                "Panic messages should describe the violated invariant",
843                0.11,
844            ),
845            directive_chunk(
846                "test-timing",
847                "Avoid sleep-based waits in tests; poll for the condition",
848                0.10,
849            ),
850        ];
851        apply_intent_alignment_gate(
852            &mut scored,
853            "return false instead of panic on invalid input",
854        );
855        assert_eq!(
856            scored
857                .iter()
858                .map(|s| s.skill_id.as_str())
859                .collect::<Vec<_>>(),
860            vec!["return-false-not-panic"],
861            "only the intent-aligned rule should survive the mixed set"
862        );
863    }
864
865    #[test]
866    fn intent_gate_all_weak_query_returns_zero() {
867        // A query with no salient (non-stop-word, ≥3-char) terms gives the gate
868        // nothing to align against. Per DiffLore's "stay silent unless it
869        // clearly applies" bias, that yields zero — no confident match.
870        let mut scored = vec![
871            directive_chunk("a", "Return false rather than panic on invalid input", 0.12),
872            directive_chunk("b", "Use structured errors in request handlers", 0.10),
873        ];
874        // "the and to of" → all stop words; nothing ≥3 chars survives lexical_terms.
875        apply_intent_alignment_gate(&mut scored, "the and to of");
876        assert!(
877            scored.is_empty(),
878            "an all-weak query must return zero, kept: {:?}",
879            scored.iter().map(|s| &s.skill_id).collect::<Vec<_>>()
880        );
881    }
882
883    #[test]
884    fn intent_gate_exempts_strongly_scored_hits() {
885        // Exact-title-strict / starter / lexically-boosted hits land at or
886        // above the exemption ceiling and are kept regardless of directive
887        // overlap — the strong-match / self-recall non-regression guarantee.
888        let mut scored = vec![ScoredRuleChunk {
889            skill_id: "exact-title-strict".to_owned(),
890            content: "Rule ID: x\nRule Name: Completely unrelated heading\n\nbody".to_owned(),
891            // 2.0 + conf band: an exact-title-strict match.
892            score: 2.7,
893            confidence: 0.7,
894        }];
895        apply_intent_alignment_gate(
896            &mut scored,
897            "return false instead of panic on invalid input",
898        );
899        assert_eq!(
900            scored.len(),
901            1,
902            "a strongly-scored hit must be exempt from the alignment gate"
903        );
904    }
905
906    #[test]
907    fn intent_gate_ratio_path_keeps_short_sharp_query_match() {
908        // A short 2-salient-term intent ("panic safety") whose directive shares
909        // ONE term is below the absolute bar (2) but covers half the query's
910        // salient terms, so the ratio path keeps it — short queries don't
911        // over-prune.
912        let mut scored = vec![directive_chunk(
913            "panic-safety",
914            "Document panic safety for unsafe blocks",
915            0.12,
916        )];
917        apply_intent_alignment_gate(&mut scored, "panic safety");
918        assert_eq!(
919            scored.len(),
920            1,
921            "a half-coverage match on a short query must survive via the ratio path"
922        );
923    }
924
925    #[test]
926    fn intent_gate_empty_input_is_noop() {
927        let mut scored: Vec<ScoredRuleChunk> = Vec::new();
928        apply_intent_alignment_gate(&mut scored, "anything");
929        assert!(scored.is_empty());
930    }
931
932    // -- Iter-2 stricter concern-match tests --
933
934    #[test]
935    fn intent_gate_drops_two_generic_anchor_overlap_without_distinctive_term() {
936        // The precision tightening over iter-1. The OLD gate kept any rule whose
937        // directive shared >=2 query terms. Here a "panic on invalid input"
938        // intent and a runtime-error rule share TWO terms — but both are GENERIC
939        // anchors (`panic`, `error`, `input`) with no specific subject/action
940        // token in common. That is exactly the topical-adjacency the A/B blamed
941        // for the extra false positives, so the hardened gate drops it.
942        let mut scored = vec![directive_chunk(
943            "runtime-error-logging",
944            "Log every panic and error with the request input id",
945            0.12,
946        )];
947        apply_intent_alignment_gate(&mut scored, "panic on invalid input handling");
948        assert!(
949            scored.is_empty(),
950            "an all-generic-anchor overlap must not establish a concern match, kept: {:?}",
951            scored.iter().map(|s| &s.skill_id).collect::<Vec<_>>()
952        );
953    }
954
955    #[test]
956    fn intent_gate_drops_off_subject_rule_that_namedrops_one_distinctive_token() {
957        // A rule about a DIFFERENT subject that merely name-drops one of the
958        // query's distinctive tokens. The query "validate the auth token before
959        // issuing a session" shares `token` with a CSV-parsing rule, but the
960        // rule's own directive is overwhelmingly about something else, so its
961        // rule-side coverage is far below the floor → dropped. This is the
962        // bidirectional half of the gate: a single shared word inside a rule
963        // about another concern is not a match.
964        let mut scored = vec![directive_chunk(
965            "csv-token-splitting",
966            "Split each CSV row into fields on the comma token boundary carefully",
967            0.12,
968        )];
969        apply_intent_alignment_gate(
970            &mut scored,
971            "validate the auth token before issuing session",
972        );
973        assert!(
974            scored.is_empty(),
975            "a one-token name-drop in an off-subject rule must be dropped, kept: {:?}",
976            scored.iter().map(|s| &s.skill_id).collect::<Vec<_>>()
977        );
978    }
979
980    #[test]
981    fn intent_gate_keeps_on_subject_rule_with_verbose_body() {
982        // No over-pruning regression: a genuinely on-subject rule whose title
983        // states the concern but whose BODY is long must still be kept. The
984        // rule-side coverage is measured against the TITLE (the core directive),
985        // so the verbose body does not dilute it below the floor.
986        let verbose_body = "When a handler receives malformed input it should return a typed \
987            error to the caller rather than calling panic!, because a panic unwinds the worker \
988            thread and takes down unrelated in-flight requests; prefer Result and propagate. \
989            See the request lifecycle docs and the error-taxonomy appendix for the full list.";
990        let mut scored = vec![ScoredRuleChunk {
991            skill_id: "validate-return-error".to_owned(),
992            content: format!(
993                "Rule ID: r\nRule Name: Validate input and return a typed error not panic\nType: correction\nTags: \n\n{verbose_body}"
994            ),
995            score: 0.12,
996            confidence: 0.7,
997        }];
998        apply_intent_alignment_gate(
999            &mut scored,
1000            "validate input and return error instead of panic",
1001        );
1002        assert_eq!(
1003            scored
1004                .iter()
1005                .map(|s| s.skill_id.as_str())
1006                .collect::<Vec<_>>(),
1007            vec!["validate-return-error"],
1008            "an on-subject rule with a long body must survive (title-scoped coverage)"
1009        );
1010    }
1011
1012    #[test]
1013    fn intent_gate_strictly_subsumes_old_overlap_count_on_anchor_only_match() {
1014        // Anchor-only overlap is rejected, while the distinctive-token sibling
1015        // is kept. Both share the same raw term count; only distinctiveness and
1016        // rule-side coverage separate them.
1017        let intent = "panic on invalid input";
1018        // overlap = {panic(g), input(g)} = 2, distinctive = 0 → DROP under new gate.
1019        let mut anchor_only = vec![directive_chunk(
1020            "anchor-only",
1021            "Buffer every panic and input event into the queue",
1022            0.12,
1023        )];
1024        apply_intent_alignment_gate(&mut anchor_only, intent);
1025        assert!(
1026            anchor_only.is_empty(),
1027            "anchor-only overlap (old gate would keep) must now drop"
1028        );
1029        // overlap = {panic(g), invalid(d)} ⊇ the subject; distinctive = 1 → KEEP.
1030        let mut on_subject = vec![directive_chunk(
1031            "on-subject",
1032            "Reject invalid input instead of letting it panic",
1033            0.12,
1034        )];
1035        apply_intent_alignment_gate(&mut on_subject, intent);
1036        assert_eq!(
1037            on_subject
1038                .iter()
1039                .map(|s| s.skill_id.as_str())
1040                .collect::<Vec<_>>(),
1041            vec!["on-subject"],
1042            "the distinctive-token sibling must be kept"
1043        );
1044    }
1045
1046    #[test]
1047    fn intent_alignment_exempt_score_sits_above_strong_band_below_exact_title() {
1048        let exempt_score = std::hint::black_box(INTENT_ALIGNMENT_EXEMPT_SCORE);
1049        let explicit_floor = std::hint::black_box(EXPLICIT_RECALL_MIN_RELEVANCE);
1050        let exact_title_floor = std::hint::black_box(2.0);
1051        let min_overlap = std::hint::black_box(MIN_INTENT_DIRECTIVE_OVERLAP);
1052
1053        assert!(
1054            exempt_score > explicit_floor,
1055            "exemption ceiling must be above the explicit relevance floor"
1056        );
1057        assert!(
1058            exempt_score < exact_title_floor,
1059            "exemption ceiling must be below the exact-title-strict (2.0 + conf) band"
1060        );
1061        assert!(
1062            min_overlap >= 2,
1063            "a lone topical-anchor overlap must be insufficient"
1064        );
1065    }
1066
1067    #[test]
1068    fn explicit_recall_floors_are_conservative_relative_to_in_retrieval_gates() {
1069        let explicit_relative_floor = std::hint::black_box(EXPLICIT_RECALL_RELATIVE_FLOOR);
1070        let retrieval_relative_floor = std::hint::black_box(RELATIVE_RELEVANCE_FLOOR);
1071        let explicit_min = std::hint::black_box(EXPLICIT_RECALL_MIN_RELEVANCE);
1072        let adaptive_threshold = std::hint::black_box(ADAPTIVE_INJECT_THRESHOLD);
1073        let min_relevance = std::hint::black_box(MIN_RELEVANCE_SCORE);
1074
1075        assert!(
1076            explicit_relative_floor < retrieval_relative_floor,
1077            "explicit relative floor must be looser than the in-retrieval one"
1078        );
1079        assert!(
1080            explicit_min > adaptive_threshold,
1081            "explicit absolute floor must sit above the hook zero-inject threshold"
1082        );
1083        assert!(
1084            explicit_min > min_relevance,
1085            "explicit absolute floor must be stricter than the bare RRF noise floor"
1086        );
1087    }
1088}