difflore_core/context/retrieval/rules.rs
1use crate::context::DEFAULT_TOP_K_RULES;
2use crate::context::ann;
3use crate::context::embedding::cosine_similarity;
4use crate::context::index_db::{self, IndexedRuleChunk, QueryFilter};
5use crate::domain::glob_match::{GlobErrorPolicy, glob_match};
6use crate::errors::CoreError;
7use crate::review_trajectory::{TrajectoryBuilder, TrajectoryStep};
8use sqlx::SqlitePool;
9use std::collections::{HashMap, HashSet};
10use std::time::Duration;
11
12use super::query_embed::embed_query_aligned_to_index;
13use super::scoring::{directive_intent_aligned, effective_confidence, infer_rule_kind};
14use super::{
15 ADAPTIVE_INJECT_THRESHOLD, EXPLICIT_RECALL_MIN_RELEVANCE, EXPLICIT_RECALL_RELATIVE_FLOOR,
16 INTENT_ALIGNMENT_EXEMPT_SCORE, MIN_RELEVANCE_SCORE, RELATIVE_RELEVANCE_FLOOR, RRF_K,
17 ScoredRuleChunk, concreteness_score, lexical_terms,
18};
19
/// Upper bound applied to the caller-requested `top_k`: the retrieval entry
/// point clamps the requested value to this before sizing any candidate list.
const MAX_RULE_RETRIEVAL_TOP_K: usize = 50;
/// Upper bound on the number of candidates requested from the ANN index
/// (the request is `3 * k`, capped here, but never below `k`).
const MAX_ANN_CANDIDATES: usize = 150;
22
23/// Retrieve rules with confidence-weighted ranking.
24/// Final score = hybrid rank score with one final confidence tie-breaker.
25/// Rules with confidence < 0.2 are excluded (likely rejected).
26pub async fn retrieve_rules(
27 index_pool: &SqlitePool,
28 query: &str,
29 top_k: Option<usize>,
30) -> Result<Vec<ScoredRuleChunk>, CoreError> {
31 retrieve_rules_with_confidence(
32 index_pool,
33 query,
34 RetrievalOptions {
35 top_k,
36 ..Default::default()
37 },
38 )
39 .await
40}
41
42/// Decide whether a chunk's `file_patterns` (JSON-encoded glob list) match
43/// the given target file path. Returns `true` if patterns are absent / empty
44/// (universal rule), or if any glob matches. Malformed JSON or an unbuildable
45/// glob set are treated as a match — never silently drop a rule because of a
46/// parse error (over-recall is correct for retrieval). Iter-9 (2026-04-18)
47/// port of cloud `patternAllows`; B8 (shared `glob_match`).
48pub(super) fn pattern_allows(file_patterns_json: Option<&str>, target_file: &str) -> bool {
49 glob_match(file_patterns_json, target_file, GlobErrorPolicy::OverRecall)
50}
51
52/// Retrieve rules with confidence weighting plus hybrid FTS / embedding retrieval.
53///
54/// Retrieval options for confidence weighting, scoping, ANN usage, and
55/// trajectory telemetry. The default path matches plain retrieval: default
56/// top-k, no confidence map, no target-file cascade, no SQL metadata filter,
57/// ANN enabled, and no trajectory capture.
58pub struct RetrievalOptions<'a> {
59 pub top_k: Option<usize>,
60 pub confidence_map: Option<&'a HashMap<String, f64>>,
61 /// Optional allow-list applied before RRF fusion. Callers that already
62 /// know the engine-eligible rule set can pass it here so disabled rules
63 /// do not consume top-k score budget.
64 pub eligible_skill_ids: Option<&'a HashSet<String>>,
65 /// Iter-13 (2026-05-02). Per-skill age in days, used by the
66 /// category-keyed half-life decay in `effective_confidence`. When
67 /// `None` (or a chunk's `skill_id` is absent from the map) the
68 /// scoring site uses `age_days = 0.0` — identical to the
69 /// pre-plumbing behaviour, so no caller breaks if it doesn't pass
70 /// a map.
71 pub age_days_map: Option<&'a HashMap<String, f32>>,
72 pub target_file: Option<&'a str>,
73 pub filter: Option<&'a QueryFilter>,
74 pub ann_enabled: bool,
75 /// Optional provider-call budget for embedding the query. Latency-
76 /// sensitive hook paths set this so a slow cloud embedder degrades to
77 /// lexical retrieval instead of timing out the host agent's hook.
78 pub embedding_timeout: Option<Duration>,
79 /// When true, a query embed that falls back to lexical because the base
80 /// budget timed out on a healthy cloud lane is retried once with a longer
81 /// cold-absorbing budget (see [`COLD_RETRY_EMBEDDING_TIMEOUT`]). Only the
82 /// human-waiting CLI `recall`/`search` path sets this; the latency-critical
83 /// hook/MCP paths leave it `false` so a cold provider never blocks the agent.
84 pub cold_start_retry: bool,
85 /// When true, suppresses broad weak matches entirely. This is useful
86 /// for unsolicited hook injection where "no extra context" is often
87 /// better than five noisy rules. Explicit user/tool queries should
88 /// leave this false so a search never looks broken just because the
89 /// best match is weak.
90 pub adaptive_prune: bool,
91 pub trajectory: Option<&'a mut TrajectoryBuilder>,
92}
93
94impl Default for RetrievalOptions<'_> {
95 fn default() -> Self {
96 Self {
97 top_k: None,
98 confidence_map: None,
99 eligible_skill_ids: None,
100 age_days_map: None,
101 target_file: None,
102 filter: None,
103 ann_enabled: true,
104 embedding_timeout: None,
105 cold_start_retry: false,
106 adaptive_prune: false,
107 trajectory: None,
108 }
109 }
110}
111
112/// Retrieve rules with confidence weighting plus hybrid FTS / embedding retrieval.
113///
114/// `confidence_map` maps `skill_id` -> `confidence_score`. If None, all rules
115/// get default confidence 0.7.
116///
117/// `target_file`: when present, applies **strict cascade** — chunks whose
118/// `file_patterns` don't match the target file are dropped before scoring.
119/// When the matched bucket is empty, returns no pattern-scoped rules rather
120/// than widening into rules explicitly tagged for other files.
121///
122/// `filter`: metadata pre-filter applied at SQL time (C2). Empty filter
123/// means "no scoping" — retrieval sees every chunk.
124///
125/// `trajectory`: optional builder that captures RRF / filter statistics
126/// for the cloud dashboard. Passing `None` disables telemetry.
127pub async fn retrieve_rules_with_confidence(
128 index_pool: &SqlitePool,
129 query: &str,
130 options: RetrievalOptions<'_>,
131) -> Result<Vec<ScoredRuleChunk>, CoreError> {
132 let RetrievalOptions {
133 top_k,
134 confidence_map,
135 eligible_skill_ids,
136 age_days_map,
137 target_file,
138 filter,
139 ann_enabled,
140 embedding_timeout,
141 cold_start_retry,
142 adaptive_prune,
143 trajectory,
144 } = options;
145 let default_filter = QueryFilter::default();
146 let filter = filter.unwrap_or(&default_filter);
147 let requested_k = top_k.unwrap_or(DEFAULT_TOP_K_RULES);
148 if requested_k == 0 {
149 return Ok(Vec::new());
150 }
151 let k = requested_k.min(MAX_RULE_RETRIEVAL_TOP_K);
152 let retrieval_start = std::time::Instant::now();
153 let embedded_query =
154 embed_query_aligned_to_index(index_pool, query, embedding_timeout, cold_start_retry).await;
155 let query_emb = embedded_query.vector;
156
157 // Switch the RRF weighting when the actual query vector is only the
158 // local lexical hash, or when a provider failure disabled the vector
159 // lane entirely.
160 //
161 // 2026-05-03 A/B verified the hybrid (local hash + FTS5 BM25) lifts
162 // self-recall@5 from 45% (FTS-only) to 85%, and @1 from 10% to 45%.
163 // The local hash isn't semantic but its bag-of-token overlap fills
164 // FTS5's strict-tokenizer gap. Worth keeping until cloud-managed
165 // embedding is configured.
166 let is_semantic = embedded_query.semantic;
167
168 // ── C2: SQL-level metadata pre-filter ──────────────────────────
169 // When the filter is empty this reduces to `SELECT *`, matching the
170 // pre-C2 behaviour and so zero-cost for unscoped callers.
171 let unfiltered_count: u32 = if filter.is_empty() {
172 0
173 } else {
174 sqlx::query_scalar!(r#"SELECT COUNT(*) as "n!: i64" FROM rule_chunks"#)
175 .fetch_one(index_pool)
176 .await
177 .unwrap_or(0)
178 .try_into()
179 .unwrap_or(u32::MAX)
180 };
181 let chunks = index_db::query_rule_chunks(index_pool, filter).await?;
182 let after_count: u32 = u32::try_from(chunks.len()).unwrap_or(u32::MAX);
183
184 // ── C4: FTS5 keyword baseline ──────────────────────────────────
185 // Pull `k*4` raw hits so we have RRF material even after the
186 // pattern cascade trims some out.
187 let fts_limit = k.saturating_mul(4).min(200).max(k);
188 let fts_hits = index_db::fts_search(index_pool, query, filter, fts_limit)
189 .await
190 .unwrap_or_default();
191
192 let default_confidence = 0.7;
193 let min_confidence = 0.2;
194
195 // Pre-partition by file-pattern match if target_file is set. This is the
196 // strict cascade: when ANY chunk matches the target, drop the rest.
197 let matched: Vec<&IndexedRuleChunk> = if let Some(tf) = target_file {
198 chunks
199 .iter()
200 .filter(|c| pattern_allows(c.file_patterns.as_deref(), tf))
201 .collect()
202 } else {
203 chunks.iter().collect()
204 };
205 let active: &[&IndexedRuleChunk] = if target_file.is_some() && matched.is_empty() {
206 &[]
207 } else {
208 &matched
209 };
210
211 // Build a lookup table so FTS hits (identified by chunk id) can be
212 // reconciled against the cascade-filtered active set.
213 let id_to_chunk: HashMap<&str, &IndexedRuleChunk> =
214 active.iter().map(|c| (c.id.as_str(), *c)).collect();
215
216 // ── Embedding-ranked candidate list ───────────────────────────
217 //
218 // Try the HNSW ANN path first. It returns a small candidate set that is
219 // intersected with the metadata-filtered `active` set. On any failure, fall
220 // back to the linear cosine scan.
221 let ann_candidates = k.saturating_mul(3).min(MAX_ANN_CANDIDATES).max(k);
222 let ann_result = if ann_enabled {
223 try_ann_rank(
224 &query_emb,
225 ann_candidates,
226 &id_to_chunk,
227 confidence_map,
228 eligible_skill_ids,
229 default_confidence,
230 min_confidence,
231 )
232 .await
233 } else {
234 None
235 };
236
237 let (mut emb_ranked, ann_used, ann_index_size, ann_returned): (
238 Vec<(&IndexedRuleChunk, f64)>,
239 bool,
240 u32,
241 u32,
242 ) = if let Some((ranked, idx_size, returned)) = ann_result {
243 (ranked, true, idx_size, returned)
244 } else {
245 let fallback: Vec<(&IndexedRuleChunk, f64)> = active
246 .iter()
247 .filter_map(|c: &&IndexedRuleChunk| {
248 if !eligible_skill_ids.is_none_or(|ids| ids.contains(&c.skill_id)) {
249 return None;
250 }
251 let confidence = confidence_map
252 .and_then(|m| m.get(&c.skill_id).copied())
253 .unwrap_or(default_confidence);
254 if confidence < min_confidence {
255 return None;
256 }
257 if query_emb.len() != c.embedding.len() {
258 return None;
259 }
260 let sim = cosine_similarity(&query_emb, &c.embedding);
261 Some((*c, f64::from(sim)))
262 })
263 .collect();
264 (fallback, false, 0, 0)
265 };
266 emb_ranked.sort_by(|a, b| b.1.total_cmp(&a.1).then_with(|| a.0.id.cmp(&b.0.id)));
267
268 let emb_rank_map: HashMap<&str, usize> = emb_ranked
269 .iter()
270 .enumerate()
271 .map(|(i, (c, _))| (c.id.as_str(), i))
272 .collect();
273
274 // ── FTS rank map (only keeps hits that survived the cascade). ──
275 let mut fts_rank_map: HashMap<&str, usize> = HashMap::new();
276 let mut fts_kept = 0u32;
277 for (i, (id, _)) in fts_hits.iter().enumerate() {
278 if id_to_chunk.contains_key(id.as_str()) {
279 fts_rank_map.insert(id.as_str(), i);
280 fts_kept += 1;
281 }
282 }
283
284 // Overlap metric for telemetry — how many ids were ranked by BOTH
285 // paths. High overlap → paths agree; low overlap → they're
286 // surfacing complementary results (the whole point of hybrid).
287 let overlap: u32 = {
288 let fts_ids: HashSet<&str> = fts_rank_map.keys().copied().collect();
289 let emb_ids: HashSet<&str> = emb_rank_map.keys().copied().collect();
290 u32::try_from(fts_ids.intersection(&emb_ids).count()).unwrap_or(u32::MAX)
291 };
292
293 // ── RRF fusion ────────────────────────────────────────────────
294 //
295 // score(chunk) = w_emb * 1/(k+rank_emb) + w_fts * 1/(k+rank_fts)
296 //
297 // When the embedder is not semantic, skew toward the FTS baseline because
298 // local SHA1 is noise-dominated.
299 let (w_emb, w_fts) = if is_semantic { (0.5, 0.5) } else { (0.2, 0.8) };
300
301 let mut fused: HashMap<&str, (f64, &IndexedRuleChunk, f64 /*confidence*/)> = HashMap::new();
302 // `_sim` is unused directly in RRF (ranks already encode it). The
303 // raw score is kept in the vector only to sort embedding candidates
304 // before assigning reciprocal ranks.
305 for (chunk, _sim) in &emb_ranked {
306 let rank = emb_rank_map.get(chunk.id.as_str()).copied().unwrap_or(0);
307 let contrib = w_emb / (RRF_K + rank as f64 + 1.0);
308 let confidence = confidence_map
309 .and_then(|m| m.get(&chunk.skill_id).copied())
310 .unwrap_or(default_confidence);
311 fused
312 .entry(chunk.id.as_str())
313 .and_modify(|e| e.0 += contrib)
314 .or_insert((contrib, *chunk, confidence));
315 }
316 for (id, rank) in &fts_rank_map {
317 if let Some(chunk) = id_to_chunk.get(id) {
318 if !eligible_skill_ids.is_none_or(|ids| ids.contains(&chunk.skill_id)) {
319 continue;
320 }
321 let contrib = w_fts / (RRF_K + *rank as f64 + 1.0);
322 let confidence = confidence_map
323 .and_then(|m| m.get(&chunk.skill_id).copied())
324 .unwrap_or(default_confidence);
325 if confidence < min_confidence {
326 continue;
327 }
328 fused
329 .entry(id)
330 .and_modify(|e| e.0 += contrib)
331 .or_insert((contrib, *chunk, confidence));
332 }
333 }
334
335 // ── Emit trajectory telemetry (best-effort, never blocks recall) ──
336 if let Some(t) = trajectory {
337 if !filter.is_empty() {
338 t.push(TrajectoryStep::RetrievalFilter {
339 before: unfiltered_count,
340 after: after_count,
341 });
342 }
343 t.push(TrajectoryStep::AnnRecall {
344 used: ann_used,
345 index_size: ann_index_size,
346 candidates: ann_returned,
347 });
348 t.push(TrajectoryStep::HybridFusion {
349 fts_hits: fts_kept,
350 emb_hits: u32::try_from(emb_ranked.len()).unwrap_or(u32::MAX),
351 overlap,
352 });
353 }
354
355 // Materialise the final scored list. `ScoredRuleChunk.score` is
356 // the fused RRF score multiplied by a *small* confidence multiplier
357 // — confidence acts as a tie-breaker rather than the primary
358 // ranking signal. The earlier `sqrt(confidence)` weight flipped
359 // the ordering on real workloads: a freshly captured (conf=0.6)
360 // conversation rule with a strong file-pattern + lexical match was
361 // demoted below cloud-extracted rules (conf=0.7) whose query
362 // overlap was 5-20% lower. Net result: the rule the user just
363 // taught DiffLore was the LAST rule injected for the very file it
364 // applies to. That breaks the slogan ("AI understands your preferences better and better") at
365 // exactly the moment users will check whether the slogan is true.
366 //
367 // The 0.9 + 0.1 * confidence multiplier keeps spread at 8%
368 // (conf=0.2 floor → 0.92; conf=1.0 → 1.0). RRF score gaps between
369 // adjacent ranks in our regime are 5-20%, so confidence can break
370 // a near-tie but cannot overturn a clear lexical/semantic winner.
371 // Strengthening (+0.05 confidence per accept) still earns +0.5%
372 // multiplier — enough to win against an equally-relevant peer at
373 // a lower confidence, which is what "the rule I've ratified twice
374 // outranks the rule captured once" should feel like.
375 let mut scored: Vec<ScoredRuleChunk> = fused
376 .into_values()
377 .map(|(score, chunk, confidence)| {
378 // Confidence tie-breaker (8% spread) + content-concreteness
379 // boost. Iter-12 (2026-04-25) added the concreteness factor
380 // because rule-impact-by-kind audit showed slogan rules
381 // ("Trust CI for workflow correctness", "Hold clean PRs for
382 // additional review") were misfiring across languages —
383 // they have no concrete code tokens to anchor relevance to.
384 // The concreteness signal counts backticked tokens + path-
385 // like fragments + version literals in the rule's content,
386 // saturated at 6 hits to avoid runaway when a rule body is
387 // mostly code. Net: a singleton rule citing
388 // `useQuery({...})` outranks a generic slogan with the
389 // same lexical match, fixing the Python −0.38 over-engineer
390 // regime we measured in iter 9.6.
391 // Iter-13 (2026-05-02). Borrow jcode's category-keyed half-life
392 // so an ancient style rule no longer outranks a freshly ratified
393 // correction on conf alone. Kind is inferred from chunk content
394 // (no `kind` column on rule_chunks); age_days comes from the
395 // optional per-call `age_days_map` (None ⇒ 0.0 ⇒ no decay,
396 // matching the original behaviour for callers that haven't
397 // wired the map yet).
398 let kind = infer_rule_kind(&chunk.content);
399 let age_days = age_days_map
400 .and_then(|m| m.get(&chunk.skill_id).copied())
401 .unwrap_or(0.0);
402 let eff_conf = f64::from(effective_confidence(confidence as f32, &kind, age_days));
403 let conf_weight = 0.1f64.mul_add(eff_conf.clamp(0.0, 1.0), 0.9);
404 let conc = concreteness_score(&chunk.content);
405 // Each concreteness "point" adds 5% to score, capped at +30%.
406 let conc_weight = 0.05f64.mul_add(conc.min(6) as f64, 1.0);
407 ScoredRuleChunk {
408 skill_id: chunk.skill_id.clone(),
409 content: chunk.content.clone(),
410 score: score * conf_weight * conc_weight,
411 confidence,
412 }
413 })
414 .collect();
415 scored.sort_by(|a, b| {
416 b.score
417 .total_cmp(&a.score)
418 .then_with(|| a.skill_id.cmp(&b.skill_id))
419 });
420
421 // Adaptive top-K + noise floor.
422 //
423 // Iter-12 hardens the "less is more" principle on top of iter-4's
424 // floors. The fastapi/Python regression (-0.38 ΔB-A in iter 9.6)
425 // traced to the agent receiving 5 weak rules on simple tasks (typo
426 // fix, parameter substitution) where claude's training already
427 // nailed the answer. Five weak rules induced over-engineering. The
428 // fix: when the top result's score itself is in the noise band,
429 // emit ZERO rules — let the agent trust its training.
430 //
431 // Adaptive zero-inject is **only safe for unsolicited
432 // injection** (PreToolUse:Read hook). Explicit user queries via
433 // Explicit canonical MCP rule-search calls must always
434 // return what's available — when a user types `search_rules
435 // intent=...`, returning empty would feel broken even if scores
436 // are weak. Callers opt in by setting `top_k=Some(5)` AND wanting
437 // adaptive behaviour explicitly via the iter-12 hook contract.
438 //
439 // The rule of thumb: if only the absolute floor would have kept
440 // ≥3 results in scope (i.e. there's a real "noise tail" worth
441 // pruning), apply adaptive. Tiny corpora with 1-2 candidates
442 // bypass adaptive — those results are fine to return as-is.
443 // Adaptive zero-inject only when we'd otherwise return many weak
444 // matches (the "5 weak rules" pathology). Small corpora and small
445 // result sets bypass — those are explicit user queries with
446 // limited candidates anyway.
447 let adaptive_eligible = adaptive_prune && scored.len() >= 5;
448 if let Some(top_score) = scored.first().map(|s| s.score) {
449 if adaptive_eligible && top_score < ADAPTIVE_INJECT_THRESHOLD {
450 // Top match is itself weak AND we have many results — this
451 // is the "5 weak rules" pathology. Return empty.
452 scored.clear();
453 } else {
454 prune_below_floors(&mut scored, top_score);
455
456 // Adaptive K: when many results cluster within 60% of the
457 // top, agent can't tell signal from noise — return just
458 // the clearly-strong ones. Skip when result set is tiny
459 // (already informative).
460 if adaptive_eligible {
461 let strong_floor = top_score * 0.60;
462 let strong_count = scored
463 .iter()
464 .take_while(|s| s.score >= strong_floor)
465 .count();
466 if strong_count > 0 && strong_count < scored.len() {
467 scored.truncate(strong_count.min(k));
468 }
469 }
470 }
471 }
472
473 scored.truncate(k);
474
475 // Memory-pipeline event: surfaces the ANN/embedding pass to the TUI
476 // Activity tab so users can see retrieval running. Best-effort —
477 // never blocks recall.
478 crate::activity_stream::record(
479 crate::activity_stream::ActivityPayload::RetrievalEmbedding {
480 hits: u32::try_from(scored.len()).unwrap_or(u32::MAX),
481 took_ms: u64::try_from(retrieval_start.elapsed().as_millis()).unwrap_or(u64::MAX),
482 },
483 );
484
485 Ok(scored)
486}
487
488/// Drop the RRF noise tail from an already-sorted (descending) scored
489/// list using the two floors that have always guarded retrieval: the
490/// absolute [`MIN_RELEVANCE_SCORE`] (RRF rounding noise / cascade-only
491/// admits) AND the relative [`RELATIVE_RELEVANCE_FLOOR`] fraction of the
492/// top hit (the "everything scored 0.02" flat-distribution pathology).
493///
494/// Factored out of `retrieve_rules_with_confidence` so the same retain
495/// is shared with the explicit-recall gate below and is unit-testable in
496/// isolation. Pure: mutates `scored` in place, never re-sorts (the caller
497/// has already sorted), so `top_score` must be the current leader's score.
498fn prune_below_floors(scored: &mut Vec<ScoredRuleChunk>, top_score: f64) {
499 let relative_floor = top_score * RELATIVE_RELEVANCE_FLOOR;
500 scored.retain(|s| s.score > MIN_RELEVANCE_SCORE && s.score >= relative_floor);
501}
502
503/// Adaptive relevance gate for the EXPLICIT recall surfaces — the MCP
504/// `search_rules` tool and the CLI `recall` command. Mirrors the hook
505/// path's adaptive pruning so an agent never has to weigh five weak rules
506/// against an empty answer: irrelevant memory is worse than none.
507///
508/// The hook path (`adaptive_prune == true` inside
509/// `retrieve_rules_with_confidence`) zero-injects on a weak top hit and
510/// drops the noise tail *before* any downstream reranking. The explicit
511/// paths can't do that in-retrieval because they still add high-value
512/// signals after fusion — exact-title-strict matches (score `2.0 + conf`),
513/// the cross-repo starter set, and the lexical-intent re-rank boost — so
514/// this gate runs on the FINAL, fully-reranked, sorted list instead. The
515/// net contract is the same as the hook's: a low-relevance query
516/// (wrong-file, no intent overlap — e.g. a Codecov rule surfacing in a
517/// wrong-file top-3) collapses to ZERO results so the caller emits its
518/// existing "no relevant memory" message rather than confident filler.
519///
520/// Two conservative gates, tuned so genuinely-strong matches are NEVER
521/// suppressed:
522/// 1. Absolute floor — if even the top hit is below
523/// [`EXPLICIT_RECALL_MIN_RELEVANCE`], every result is noise: clear.
524/// After the lexical-intent re-rank a genuinely relevant top hit
525/// sits far above this floor (boosted into the 0.1+ range), while a
526/// cascade-only / no-overlap top hit stays in the raw RRF band
527/// (~0.001–0.005) and is correctly dropped.
528/// 2. Relative floor — drop tail results below
529/// [`EXPLICIT_RECALL_RELATIVE_FLOOR`] of the (surviving) top hit, so
530/// a strong leader doesn't drag along far-weaker filler. Deliberately
531/// looser than the hook's [`RELATIVE_RELEVANCE_FLOOR`]: explicit
532/// queries should keep more of a real result set, only shedding the
533/// clearly-irrelevant tail.
534///
535/// Pure and in-place. The caller must pass a list already sorted
536/// descending by `score` (both explicit call sites do, via their final
537/// re-rank). Strong matches (including exact-title-strict and starter
538/// hits) clear both floors by a wide margin, so this never regresses a
539/// real recall.
540pub fn apply_explicit_recall_threshold(scored: &mut Vec<ScoredRuleChunk>) {
541 let Some(top_score) = scored.first().map(|s| s.score) else {
542 return;
543 };
544 // Absolute floor: the best match itself is noise → return nothing.
545 if top_score < EXPLICIT_RECALL_MIN_RELEVANCE {
546 scored.clear();
547 return;
548 }
549 // Relative floor: shed the tail far below the leader.
550 let relative_floor = top_score * EXPLICIT_RECALL_RELATIVE_FLOOR;
551 scored.retain(|s| s.score >= relative_floor);
552}
553
554/// Intent-alignment gate for the EXPLICIT recall surfaces — applied BEFORE
555/// [`apply_explicit_recall_threshold`] on the final, fully-reranked list.
556///
557/// WHY: topically adjacent rules can clear relevance floors while addressing a
558/// different action or subject than the directive. This gate adds the missing
559/// axis: does the rule's directive match the query intent, not just its topic?
560///
561/// Behaviour, biased hard toward FEWER / zero (DiffLore's "stay silent
562/// unless it clearly applies" positioning):
563/// * An all-weak query (no salient terms after stop-word filtering) cannot
564/// establish intent for ANY rule → clear. Returning nothing is correct
565/// here: we have no signal to claim a match.
566/// * A candidate is KEPT when it is either strongly scored (≥
567/// [`INTENT_ALIGNMENT_EXEMPT_SCORE`] — exact-title-strict / starter /
568/// strongly lexically-boosted hits, already intent-validated upstream)
569/// or its directive is intent-aligned per [`directive_intent_aligned`].
570/// * Every other candidate — the topically-adjacent middle band — is
571/// dropped.
572///
573/// Conservative by construction: the strong-score exemption guarantees no
574/// genuinely-strong match (and therefore no eval self-recall hit, where the
575/// query is the rule's own intent text and overlap is near-total) is ever
576/// suppressed. Pure / in-place; order is preserved (the caller has already
577/// sorted, and this only `retain`s).
578pub fn apply_intent_alignment_gate(scored: &mut Vec<ScoredRuleChunk>, intent: &str) {
579 if scored.is_empty() {
580 return;
581 }
582 let query_terms = lexical_terms(intent);
583 if query_terms.is_empty() {
584 // No salient intent to align against — per the "fewer / zero"
585 // bias, an unscorable intent yields no confident matches.
586 scored.clear();
587 return;
588 }
589 scored.retain(|chunk| {
590 chunk.score >= INTENT_ALIGNMENT_EXEMPT_SCORE
591 || directive_intent_aligned(&chunk.content, &query_terms)
592 });
593}
594
595/// Attempt the HNSW ANN ranking path for the current project.
596///
597/// Returns `Some((ranked, index_size, returned))` on a successful ANN
598/// lookup that produced at least one candidate inside the
599/// metadata-filtered `active` set. Returns `None` on any of:
600/// - empty / missing on-disk index
601/// - dim mismatch between query and stored vectors
602/// - ANN search yielded zero usable candidates (e.g. all hits were
603/// tombstoned or outside the active filter)
604/// - any internal error talking to the ANN cache
605///
606/// The caller MUST treat `None` as "use the linear cosine scan". This
607/// is the safety net that guarantees retrieval keeps working when the
608/// HNSW index is absent or stale.
609async fn try_ann_rank<'a>(
610 query_emb: &[f32],
611 candidates: usize,
612 id_to_chunk: &HashMap<&'a str, &'a IndexedRuleChunk>,
613 confidence_map: Option<&HashMap<String, f64>>,
614 eligible_skill_ids: Option<&HashSet<String>>,
615 default_confidence: f64,
616 min_confidence: f64,
617) -> Option<(Vec<(&'a IndexedRuleChunk, f64)>, u32, u32)> {
618 if query_emb.is_empty() || candidates == 0 {
619 return None;
620 }
621 // Resolve the project hash from the current working directory. The
622 // ANN cache is keyed on this hash so MCP calls running in the same
623 // project share one graph across calls. Retrieval call sites that
624 // run outside a project root (unit tests in a tempdir) will still
625 // get a valid hash — they just won't have a persisted graph to
626 // reload, which is fine: `load_or_empty` returns an empty index and
627 // we fall through to the linear scan.
628 let project_root = crate::db::current_project_root();
629 let project_hash = crate::db::project_hash_from_root(&project_root);
630
631 let ann_arc = ann::get_ann_for_project(&project_hash, query_emb.len())
632 .await
633 .ok()?;
634 let ann_guard = ann_arc.lock().await;
635 let index_size = ann_guard.live_size();
636 if index_size == 0 {
637 return None;
638 }
639 let hits = ann_guard.search(query_emb, candidates);
640 if hits.is_empty() {
641 return None;
642 }
643 let returned = u32::try_from(hits.len()).unwrap_or(u32::MAX);
644
645 // Translate the ANN hit set back into `&IndexedRuleChunk` + RRF
646 // score. The score we carry is raw cosine similarity so confidence
647 // is applied at exactly one ranking site (the final tie-breaker).
648 // DistCosine returns `1 - cos`, so cosine similarity is `1 - distance`.
649 let mut ranked: Vec<(&IndexedRuleChunk, f64)> = Vec::with_capacity(hits.len());
650 for (chunk_id, distance) in hits {
651 let Some(chunk) = id_to_chunk.get(chunk_id.as_str()) else {
652 // Hit lives in the graph but didn't survive the metadata
653 // pre-filter — drop it.
654 continue;
655 };
656 if !eligible_skill_ids.is_none_or(|ids| ids.contains(&chunk.skill_id)) {
657 continue;
658 }
659 let confidence = confidence_map
660 .and_then(|m| m.get(&chunk.skill_id).copied())
661 .unwrap_or(default_confidence);
662 if confidence < min_confidence {
663 continue;
664 }
665 let sim = (1.0 - f64::from(distance)).max(0.0);
666 ranked.push((*chunk, sim));
667 }
668 if ranked.is_empty() {
669 // ANN surfaced hits but none survived the filter — treat as a
670 // miss so the linear scan can try to find something.
671 return None;
672 }
673 Some((ranked, index_size, returned))
674}
675
676#[cfg(test)]
677mod tests {
678 use super::super::MIN_INTENT_DIRECTIVE_OVERLAP;
679 use super::*;
680
681 fn chunk(id: &str, score: f64) -> ScoredRuleChunk {
682 ScoredRuleChunk {
683 skill_id: id.to_owned(),
684 content: format!("Rule ID: {id}\nRule Name: {id}\n\nbody"),
685 score,
686 confidence: 0.7,
687 }
688 }
689
690 #[test]
691 fn explicit_recall_threshold_strong_top_hit_survives() {
692 // A genuinely strong match (lexically boosted into the 0.1+ band)
693 // must always survive — the gate is conservative and never
694 // suppresses real recall.
695 let mut scored = vec![chunk("strong", 0.30), chunk("supporting", 0.12)];
696 apply_explicit_recall_threshold(&mut scored);
697 assert_eq!(scored.len(), 2, "strong matches must not be pruned");
698 assert_eq!(scored[0].skill_id, "strong");
699 }
700
701 #[test]
702 fn explicit_recall_threshold_all_weak_returns_empty() {
703 // Wrong-file / low-relevance query: even the top hit is in the raw
704 // fused RRF noise band, so the whole set is filler and should return
705 // zero results.
706 let mut scored = vec![
707 chunk("noise-1", 0.004),
708 chunk("noise-2", 0.003),
709 chunk("noise-3", 0.002),
710 chunk("noise-4", 0.0015),
711 chunk("noise-5", 0.001),
712 ];
713 apply_explicit_recall_threshold(&mut scored);
714 assert!(
715 scored.is_empty(),
716 "a query whose only matches are weak must return zero results"
717 );
718 }
719
720 #[test]
721 fn explicit_recall_threshold_borderline_keeps_only_strong() {
722 // Borderline set: one clear leader well above the absolute floor,
723 // plus tail rules far below it. The leader (and anything within the
724 // relative band) survives; the far-below-leader tail is dropped.
725 let mut scored = vec![
726 chunk("leader", 0.40),
727 chunk("near", 0.10), // 25% of leader — within the 0.20 relative floor
728 chunk("tail-1", 0.05), // 12.5% of leader — dropped
729 chunk("tail-2", 0.02),
730 chunk("tail-3", 0.011),
731 ];
732 apply_explicit_recall_threshold(&mut scored);
733 let ids: Vec<&str> = scored.iter().map(|s| s.skill_id.as_str()).collect();
734 assert_eq!(
735 ids,
736 vec!["leader", "near"],
737 "only the leader and rules within the relative band survive"
738 );
739 }
740
741 #[test]
742 fn explicit_recall_threshold_top_hit_at_absolute_floor_is_kept() {
743 // A top hit exactly at the absolute floor is NOT below it, so it
744 // survives — proving the gate suppresses only genuine sub-floor
745 // noise, never a borderline-but-present match.
746 let mut scored = vec![chunk("at-floor", EXPLICIT_RECALL_MIN_RELEVANCE)];
747 apply_explicit_recall_threshold(&mut scored);
748 assert_eq!(scored.len(), 1, "top hit at the floor must be kept");
749 }
750
751 #[test]
752 fn explicit_recall_threshold_empty_input_is_noop() {
753 let mut scored: Vec<ScoredRuleChunk> = Vec::new();
754 apply_explicit_recall_threshold(&mut scored);
755 assert!(scored.is_empty());
756 }
757
758 // -- Intent-alignment gate tests (precision fix) --
759
760 /// Build a candidate whose distilled directive is its `Rule Name:` title.
761 /// `score` is left in the moderate (gated) band by default so the gate's
762 /// alignment check — not the strong-score exemption — decides its fate.
763 fn directive_chunk(id: &str, title: &str, score: f64) -> ScoredRuleChunk {
764 ScoredRuleChunk {
765 skill_id: id.to_owned(),
766 content: format!(
767 "Rule ID: {id}\nRule Name: {title}\nType: convention\nTags: \n\n{title}."
768 ),
769 score,
770 confidence: 0.7,
771 }
772 }
773
#[test]
fn intent_gate_drops_topically_adjacent_different_subject_rule() {
    // Regression scenario: a "return false vs panic" query pulls in a rule
    // about how panic MESSAGES are worded and a rule about test timing. They
    // live in the same topical neighbourhood ("panic" / "test") but prescribe
    // a different action on a different subject, which distracts the agent.
    // Neither title has enough in common with the query, so the gate is
    // expected to remove both.
    let query = "return false instead of panic on invalid input";
    let wording_rule = directive_chunk(
        "panic-message-wording",
        "Panic messages should describe the violated invariant",
        0.12,
    );
    let timing_rule = directive_chunk(
        "test-timing",
        "Avoid sleep-based waits in tests; poll for the condition",
        0.10,
    );
    let mut candidates = vec![wording_rule, timing_rule];

    apply_intent_alignment_gate(&mut candidates, query);

    assert!(
        candidates.is_empty(),
        "topically-adjacent, wrong-subject rules must be dropped, kept: {:?}",
        candidates
            .iter()
            .map(|candidate| &candidate.skill_id)
            .collect::<Vec<_>>()
    );
}
804
#[test]
fn intent_gate_keeps_directly_on_subject_rule() {
    // The rule title repeats the query's verb and its object (`return`,
    // `false`, `panic`, `invalid`, `input`). With that much in common the
    // candidate must be kept even though its 0.12 score is only moderate.
    let query = "return false instead of panic on invalid input";
    let mut candidates = vec![directive_chunk(
        "return-false-not-panic",
        "Return false rather than panic on invalid input",
        0.12,
    )];

    apply_intent_alignment_gate(&mut candidates, query);

    let surviving_ids: Vec<&str> = candidates
        .iter()
        .map(|candidate| candidate.skill_id.as_str())
        .collect();
    assert_eq!(
        surviving_ids,
        vec!["return-false-not-panic"],
        "a directly-on-subject directive must survive the intent gate"
    );
}
828
#[test]
fn intent_gate_keeps_on_subject_drops_adjacent_in_same_set() {
    // Mixed candidate list as seen in the A/B run: the rule that actually
    // answers the query, followed by the two topically-adjacent distractors.
    // After gating only the on-subject rule may remain.
    let query = "return false instead of panic on invalid input";
    let on_subject = directive_chunk(
        "return-false-not-panic",
        "Return false rather than panic on invalid input",
        0.12,
    );
    let wording_distractor = directive_chunk(
        "panic-message-wording",
        "Panic messages should describe the violated invariant",
        0.11,
    );
    let timing_distractor = directive_chunk(
        "test-timing",
        "Avoid sleep-based waits in tests; poll for the condition",
        0.10,
    );
    let mut candidates = vec![on_subject, wording_distractor, timing_distractor];

    apply_intent_alignment_gate(&mut candidates, query);

    let surviving_ids: Vec<&str> = candidates
        .iter()
        .map(|candidate| candidate.skill_id.as_str())
        .collect();
    assert_eq!(
        surviving_ids,
        vec!["return-false-not-panic"],
        "only the intent-aligned rule should survive the mixed set"
    );
}
864
#[test]
fn intent_gate_all_weak_query_returns_zero() {
    // The query below is made up solely of short function words, so it is
    // expected to yield no salient terms for the gate to align against
    // (presumably they are all filtered as stop words by the term
    // extraction; confirm against `lexical_terms`). Following the
    // "stay silent unless it clearly applies" bias, every candidate is
    // dropped and the result is empty.
    let weak_query = "the and to of";
    let mut candidates = vec![
        directive_chunk("a", "Return false rather than panic on invalid input", 0.12),
        directive_chunk("b", "Use structured errors in request handlers", 0.10),
    ];

    apply_intent_alignment_gate(&mut candidates, weak_query);

    assert!(
        candidates.is_empty(),
        "an all-weak query must return zero, kept: {:?}",
        candidates
            .iter()
            .map(|candidate| &candidate.skill_id)
            .collect::<Vec<_>>()
    );
}
882
#[test]
fn intent_gate_exempts_strongly_scored_hits() {
    // Non-regression guarantee for strong matches: a candidate scored in the
    // exact-title-strict band must be kept even when its title has nothing
    // in common with the query.
    let strong_hit = ScoredRuleChunk {
        skill_id: String::from("exact-title-strict"),
        content: String::from("Rule ID: x\nRule Name: Completely unrelated heading\n\nbody"),
        // Exact-title-strict matches score 2.0 plus a confidence component.
        score: 2.7,
        confidence: 0.7,
    };
    let mut candidates = vec![strong_hit];

    apply_intent_alignment_gate(
        &mut candidates,
        "return false instead of panic on invalid input",
    );

    assert_eq!(
        candidates.len(),
        1,
        "a strongly-scored hit must be exempt from the alignment gate"
    );
}
905
#[test]
fn intent_gate_ratio_path_keeps_short_sharp_query_match() {
    // A short, two-word intent must not be over-pruned: a rule covering at
    // least half of the query's salient terms is expected to be kept.
    //
    // NOTE(review): the title used here contains BOTH query words (`panic`
    // and `safety`), so whether this fixture reaches the ratio path or is
    // already accepted by the absolute-overlap path depends on how the gate
    // counts terms. Confirm against `apply_intent_alignment_gate`.
    let short_query = "panic safety";
    let mut candidates = vec![directive_chunk(
        "panic-safety",
        "Document panic safety for unsafe blocks",
        0.12,
    )];

    apply_intent_alignment_gate(&mut candidates, short_query);

    assert_eq!(
        candidates.len(),
        1,
        "a half-coverage match on a short query must survive via the ratio path"
    );
}
924
#[test]
fn intent_gate_empty_input_is_noop() {
    // Gating an empty candidate list must leave it empty.
    let mut candidates = Vec::<ScoredRuleChunk>::new();
    apply_intent_alignment_gate(&mut candidates, "anything");
    assert!(candidates.is_empty());
}
931
932 // -- Iter-2 stricter concern-match tests --
933
#[test]
fn intent_gate_drops_two_generic_anchor_overlap_without_distinctive_term() {
    // Tightening over iter-1, where sharing two or more query terms was
    // enough to keep a rule. Here the query and a logging rule have exactly
    // two words in common, `panic` and `input`, and both are generic topical
    // anchors rather than a specific subject or action. The A/B attributed
    // the extra false positives to this kind of adjacency, so the hardened
    // gate is expected to drop the rule.
    let query = "panic on invalid input handling";
    let logging_rule = directive_chunk(
        "runtime-error-logging",
        "Log every panic and error with the request input id",
        0.12,
    );
    let mut candidates = vec![logging_rule];

    apply_intent_alignment_gate(&mut candidates, query);

    assert!(
        candidates.is_empty(),
        "an all-generic-anchor overlap must not establish a concern match, kept: {:?}",
        candidates
            .iter()
            .map(|candidate| &candidate.skill_id)
            .collect::<Vec<_>>()
    );
}
954
#[test]
fn intent_gate_drops_off_subject_rule_that_namedrops_one_distinctive_token() {
    // The rule-side half of the gate. The auth-token query and a CSV-parsing
    // rule have a single word in common (`token`), while nearly everything
    // else in the rule title is about another concern. One borrowed word
    // inside an otherwise unrelated rule must not count as a match, so the
    // candidate is expected to be dropped.
    let query = "validate the auth token before issuing session";
    let csv_rule = directive_chunk(
        "csv-token-splitting",
        "Split each CSV row into fields on the comma token boundary carefully",
        0.12,
    );
    let mut candidates = vec![csv_rule];

    apply_intent_alignment_gate(&mut candidates, query);

    assert!(
        candidates.is_empty(),
        "a one-token name-drop in an off-subject rule must be dropped, kept: {:?}",
        candidates
            .iter()
            .map(|candidate| &candidate.skill_id)
            .collect::<Vec<_>>()
    );
}
979
#[test]
fn intent_gate_keeps_on_subject_rule_with_verbose_body() {
    // Guard against over-pruning: the rule's TITLE states the same concern
    // as the query, but its body is a long paragraph. The test expects the
    // candidate to be kept, i.e. the long body must not dilute the match
    // (the assertion message describes this as title-scoped coverage).
    let verbose_body = "When a handler receives malformed input it should return a typed \
        error to the caller rather than calling panic!, because a panic unwinds the worker \
        thread and takes down unrelated in-flight requests; prefer Result and propagate. \
        See the request lifecycle docs and the error-taxonomy appendix for the full list.";
    let content = format!(
        "Rule ID: r\nRule Name: Validate input and return a typed error not panic\nType: correction\nTags: \n\n{verbose_body}"
    );
    let mut candidates = vec![ScoredRuleChunk {
        skill_id: String::from("validate-return-error"),
        content,
        score: 0.12,
        confidence: 0.7,
    }];

    apply_intent_alignment_gate(
        &mut candidates,
        "validate input and return error instead of panic",
    );

    let surviving_ids: Vec<&str> = candidates
        .iter()
        .map(|candidate| candidate.skill_id.as_str())
        .collect();
    assert_eq!(
        surviving_ids,
        vec!["validate-return-error"],
        "an on-subject rule with a long body must survive (title-scoped coverage)"
    );
}
1011
#[test]
fn intent_gate_strictly_subsumes_old_overlap_count_on_anchor_only_match() {
    // Two sibling fixtures gated against the same intent. A rule that only
    // shares generic anchors with the intent must be rejected, while a rule
    // that also shares the distinctive word `invalid` must be kept.
    let intent = "panic on invalid input";

    // The title shares `panic` and `input` with the intent and nothing more
    // specific. A plain "two shared terms" rule would have kept it; the
    // hardened gate is expected to drop it.
    let mut generic_only = vec![directive_chunk(
        "anchor-only",
        "Buffer every panic and input event into the queue",
        0.12,
    )];
    apply_intent_alignment_gate(&mut generic_only, intent);
    assert!(
        generic_only.is_empty(),
        "anchor-only overlap (old gate would keep) must now drop"
    );

    // The title shares `panic`, `input` and the distinctive `invalid` with
    // the intent, so it is expected to be kept.
    let mut distinctive = vec![directive_chunk(
        "on-subject",
        "Reject invalid input instead of letting it panic",
        0.12,
    )];
    apply_intent_alignment_gate(&mut distinctive, intent);
    let surviving_ids: Vec<&str> = distinctive
        .iter()
        .map(|candidate| candidate.skill_id.as_str())
        .collect();
    assert_eq!(
        surviving_ids,
        vec!["on-subject"],
        "the distinctive-token sibling must be kept"
    );
}
1045
#[test]
fn intent_alignment_exempt_score_sits_above_strong_band_below_exact_title() {
    // Sanity-check the ordering of the gate's tuning constants. Each value
    // is routed through `black_box` so the comparisons are made on opaque
    // runtime values rather than on compile-time constants.
    let exemption_ceiling = std::hint::black_box(INTENT_ALIGNMENT_EXEMPT_SCORE);
    let explicit_recall_floor = std::hint::black_box(EXPLICIT_RECALL_MIN_RELEVANCE);
    // Base score of the exact-title-strict band (2.0 + confidence).
    let exact_title_base = std::hint::black_box(2.0);
    let required_overlap = std::hint::black_box(MIN_INTENT_DIRECTIVE_OVERLAP);

    assert!(
        exemption_ceiling > explicit_recall_floor,
        "exemption ceiling must be above the explicit relevance floor"
    );
    assert!(
        exemption_ceiling < exact_title_base,
        "exemption ceiling must be below the exact-title-strict (2.0 + conf) band"
    );
    assert!(
        required_overlap >= 2,
        "a lone topical-anchor overlap must be insufficient"
    );
}
1066
#[test]
fn explicit_recall_floors_are_conservative_relative_to_in_retrieval_gates() {
    // Pin the relationship between the explicit-recall floors and the gates
    // applied during retrieval. Values pass through `black_box` so the
    // comparisons are made on opaque runtime values.
    let recall_relative = std::hint::black_box(EXPLICIT_RECALL_RELATIVE_FLOOR);
    let in_retrieval_relative = std::hint::black_box(RELATIVE_RELEVANCE_FLOOR);
    let recall_absolute = std::hint::black_box(EXPLICIT_RECALL_MIN_RELEVANCE);
    let zero_inject_threshold = std::hint::black_box(ADAPTIVE_INJECT_THRESHOLD);
    let rrf_noise_floor = std::hint::black_box(MIN_RELEVANCE_SCORE);

    assert!(
        recall_relative < in_retrieval_relative,
        "explicit relative floor must be looser than the in-retrieval one"
    );
    assert!(
        recall_absolute > zero_inject_threshold,
        "explicit absolute floor must sit above the hook zero-inject threshold"
    );
    assert!(
        recall_absolute > rrf_noise_floor,
        "explicit absolute floor must be stricter than the bare RRF noise floor"
    );
}
1088}