// bones_search/fusion/scoring.rs
//! Duplicate risk scoring via Reciprocal Rank Fusion (RRF).
//!
//! This module implements RRF to fuse ranked lists from multiple search signals
//! (lexical, semantic, structural) into a composite duplicate risk score, then
//! maps that score to a discrete risk classification.
//!
//! # Algorithm Overview
//!
//! **Reciprocal Rank Fusion (RRF)** combines multiple ranked lists by scoring
//! each item based on its position in each list:
//!
//! ```text
//! RRF score = sum over all lists of: 1 / (k + rank_in_list)
//! ```
//!
//! Where:
//! - `k` is a constant (default 60) that reduces the impact of high ranks.
//! - Items absent from a list contribute 0 to the sum.
//! - Results are sorted by composite score descending.
//!
//! # Risk Classification
//!
//! The composite RRF score is mapped to a categorical risk level:
//!
//! | Score Range       | Classification     |
//! |-------------------|--------------------|
//! | >= 0.90           | `LikelyDuplicate`  |
//! | 0.70..0.89        | `PossiblyRelated`  |
//! | 0.50..0.69        | `MaybeRelated`     |
//! | < 0.50            | None               |
//!
//! Thresholds are configurable via project config (`.bones/config.toml`).
//!
//! NOTE(review): with the default `k = 60` and three input lists, a raw RRF
//! sum is at most `3 / 61 ≈ 0.049`, far below these thresholds — confirm
//! whether scores are normalized somewhere before classification.
//!
//! # Example
//!
//! ```ignore
//! use bones_search::fusion::scoring::{rrf_fuse, classify_risk, DuplicateRisk};
//! use bones_core::config::SearchConfig;
//!
//! let lexical_ranked = vec!["bn-001", "bn-002"];
//! let semantic_ranked = vec!["bn-002", "bn-001", "bn-003"];
//! let structural_ranked = vec!["bn-001"];
//!
//! // Fuse into composite scores
//! let fused = rrf_fuse(&lexical_ranked, &semantic_ranked, &structural_ranked, 60);
//!
//! // Classify a score
//! let config = SearchConfig::default();
//! let risk = classify_risk(0.85, &config);
//! assert_eq!(risk, DuplicateRisk::PossiblyRelated);
//! ```

use serde::{Deserialize, Serialize};
use std::collections::BTreeMap;

/// Risk classification for a candidate duplicate pair.
///
/// Based on the composite RRF score, candidates are classified into one of
/// four risk levels. These determine how prominently the candidate is
/// presented to the user and whether automated warnings are triggered.
///
/// Serialized in `snake_case` (e.g. `likely_duplicate`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum DuplicateRisk {
    /// Fused score >= 0.90 — almost certainly the same item.
    ///
    /// Automatic warnings on create, suggestion to merge.
    LikelyDuplicate,

    /// Fused score 0.70..0.89 — strong overlap, worth reviewing.
    ///
    /// Shown prominently in search results.
    PossiblyRelated,

    /// Fused score 0.50..0.69 — some similarity, lower confidence.
    ///
    /// Shown in extended results.
    MaybeRelated,

    /// Fused score < 0.50 — not considered a duplicate.
    ///
    /// Not displayed in duplicate context, though may appear in other searches.
    None,
}

/// A single duplicate candidate with full scoring breakdown.
///
/// Includes the composite RRF-fused score, per-layer rank positions for
/// explainability, and the final risk classification.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct DupCandidate {
    /// The item ID of the candidate (e.g., `"bn-001"`).
    pub item_id: String,

    /// Final RRF-fused composite score.
    ///
    /// Higher scores indicate greater confidence in duplicate/relatedness.
    ///
    /// NOTE(review): a raw RRF sum is bounded by `lists / (k + 1)` (≈ 0.049
    /// for three lists with `k = 60`), not `[0, 1]` as previously claimed —
    /// confirm whether scores are normalized before threshold comparison.
    pub composite_score: f32,

    /// Rank position in the lexical (FTS5) search results (1-indexed).
    ///
    /// `usize::MAX` if the item did not appear in lexical results.
    pub lexical_rank: usize,

    /// Rank position in the semantic (KNN) search results (1-indexed).
    ///
    /// `usize::MAX` if the item did not appear in semantic results.
    pub semantic_rank: usize,

    /// Rank position in the structural similarity results (1-indexed).
    ///
    /// `usize::MAX` if the item did not appear in structural results.
    pub structural_rank: usize,

    /// Classification based on `composite_score` vs configured thresholds.
    pub risk: DuplicateRisk,
}

// ---------------------------------------------------------------------------
// RRF Fusion
// ---------------------------------------------------------------------------

/// Reciprocal Rank Fusion: merge ranked lists from multiple signals.
///
/// Fuses lexical, semantic, and structural ranked lists using RRF to produce
/// a composite score for each item. Items are scored based on their positions
/// in the input lists; items absent from a list contribute 0.
///
/// # Parameters
///
/// - `lexical` — Ranked item IDs from lexical (FTS5) search, best first.
/// - `semantic` — Ranked item IDs from semantic (KNN) search.
/// - `structural` — Ranked item IDs from structural similarity search.
/// - `k` — RRF constant (e.g., 60). Higher values reduce rank impact.
///
/// # Returns
///
/// A vector of `(item_id, composite_score)` sorted by score descending, with
/// ties broken by item ID ascending for deterministic output. Note that raw
/// scores are small: at most `3 / (k + 1)` when an item tops all three lists.
///
/// # Algorithm
///
/// For each unique item across all lists:
/// ```text
/// rrf_score = sum(1 / (k + rank_in_list) for each list where item appears)
/// ```
/// Ranks are 1-indexed; absent items contribute 0.
///
/// # Example
///
/// ```
/// use bones_search::fusion::scoring::rrf_fuse;
///
/// let lex = vec!["bn-001", "bn-002"];
/// let sem = vec!["bn-002", "bn-001"];
/// let structural = vec!["bn-001"];
/// let result = rrf_fuse(&lex, &sem, &structural, 60);
/// assert!(!result.is_empty());
/// ```
#[must_use]
#[allow(clippy::cast_precision_loss)]
pub fn rrf_fuse(
    lexical: &[&str],
    semantic: &[&str],
    structural: &[&str],
    k: usize,
) -> Vec<(String, f32)> {
    // Fold one ranked list into the running score map. Ranks are 1-indexed.
    // (Previously this loop was repeated verbatim for each of the three
    // lists; a single helper keeps the contribution formula in one place.)
    fn accumulate(scores: &mut BTreeMap<String, f32>, list: &[&str], k: usize) {
        for (idx, item_id) in list.iter().enumerate() {
            let rank = idx + 1; // 1-indexed
            let contribution = 1.0 / (k as f32 + rank as f32);
            *scores.entry((*item_id).to_string()).or_insert(0.0) += contribution;
        }
    }

    let mut scores: BTreeMap<String, f32> = BTreeMap::new();
    accumulate(&mut scores, lexical, k);
    accumulate(&mut scores, semantic, k);
    accumulate(&mut scores, structural, k);

    // Sort by score descending, then by item_id ascending so ties have a
    // stable, deterministic order. RRF scores are finite (no NaN inputs),
    // so the partial_cmp fallback to Equal is only defensive.
    let mut result: Vec<_> = scores.into_iter().collect();
    result.sort_by(|a, b| {
        b.1.partial_cmp(&a.1)
            .unwrap_or(std::cmp::Ordering::Equal)
            .then_with(|| a.0.cmp(&b.0))
    });

    result
}

209/// Build a ranked list of `DupCandidate` items with classification and rank metadata.
210///
211/// Takes the fused RRF scores and wraps them in `DupCandidate` structs that include
212/// per-layer rank positions for explainability. The input lists are searched to find
213/// each item's rank in each layer.
214///
215/// # Parameters
216///
217/// - `fused` — Pre-sorted list of `(item_id, composite_score)` from `rrf_fuse`.
218/// - `lexical` — Original ranked list from lexical search (for rank lookup).
219/// - `semantic` — Original ranked list from semantic search.
220/// - `structural` — Original ranked list from structural search.
221/// - `config` — Search configuration with threshold values.
222///
223/// # Returns
224///
225/// A vector of `DupCandidate` structs in the same order as `fused` (descending score).
226///
227/// # Example
228///
229/// ```ignore
230/// use bones_search::fusion::scoring::{rrf_fuse, build_dup_candidates};
231/// use bones_core::config::SearchConfig;
232///
233/// let lex = vec!["bn-001", "bn-002"];
234/// let sem = vec!["bn-002", "bn-001"];
235/// let str = vec!["bn-001"];
236/// let config = SearchConfig::default();
237///
238/// let fused = rrf_fuse(&lex, &sem, &str, config.rrf_k);
239/// let candidates = build_dup_candidates(&fused, &lex, &sem, &str, &config);
240///
241/// for cand in candidates {
242///     println!("{}: {} ({})", cand.item_id, cand.composite_score, cand.risk);
243/// }
244/// ```
245#[must_use]
246pub fn build_dup_candidates(
247    fused: &[(String, f32)],
248    lexical: &[&str],
249    semantic: &[&str],
250    structural: &[&str],
251    config: &SearchConfig,
252) -> Vec<DupCandidate> {
253    let mut candidates = Vec::with_capacity(fused.len());
254
255    let lexical_map: std::collections::HashMap<&str, usize> = lexical
256        .iter()
257        .enumerate()
258        .map(|(i, &id)| (id, i + 1))
259        .collect();
260    let semantic_map: std::collections::HashMap<&str, usize> = semantic
261        .iter()
262        .enumerate()
263        .map(|(i, &id)| (id, i + 1))
264        .collect();
265    let structural_map: std::collections::HashMap<&str, usize> = structural
266        .iter()
267        .enumerate()
268        .map(|(i, &id)| (id, i + 1))
269        .collect();
270
271    for (item_id, composite_score) in fused {
272        let lexical_rank = lexical_map
273            .get(item_id.as_str())
274            .copied()
275            .unwrap_or(usize::MAX);
276        let semantic_rank = semantic_map
277            .get(item_id.as_str())
278            .copied()
279            .unwrap_or(usize::MAX);
280        let structural_rank = structural_map
281            .get(item_id.as_str())
282            .copied()
283            .unwrap_or(usize::MAX);
284
285        let risk = classify_risk(*composite_score, config);
286
287        candidates.push(DupCandidate {
288            item_id: item_id.clone(),
289            composite_score: *composite_score,
290            lexical_rank,
291            semantic_rank,
292            structural_rank,
293            risk,
294        });
295    }
296
297    candidates
298}
299
300// ---------------------------------------------------------------------------
301// Risk Classification
302// ---------------------------------------------------------------------------
303
304/// Map a fused RRF score to a `DuplicateRisk` classification.
305///
306/// Classification boundaries are configurable via `SearchConfig` but typically:
307/// - >= 0.90: `LikelyDuplicate`
308/// - 0.70..0.89: `PossiblyRelated`
309/// - 0.50..0.69: `MaybeRelated`
310/// - < 0.50: `None`
311///
312/// # Parameters
313///
314/// - `score` — Composite RRF score (typically in [0, 1], but unbounded in principle).
315/// - `config` — Search configuration with threshold values.
316///
317/// # Returns
318///
319/// The appropriate `DuplicateRisk` variant for this score.
320///
321/// # Example
322///
323/// ```ignore
324/// use bones_search::fusion::scoring::classify_risk;
325/// use bones_core::config::SearchConfig;
326///
327/// let config = SearchConfig::default();
328/// assert_eq!(classify_risk(0.95, &config), DuplicateRisk::LikelyDuplicate);
329/// assert_eq!(classify_risk(0.75, &config), DuplicateRisk::PossiblyRelated);
330/// ```
331#[must_use]
332pub fn classify_risk(score: f32, config: &SearchConfig) -> DuplicateRisk {
333    if score >= config.likely_duplicate_threshold {
334        DuplicateRisk::LikelyDuplicate
335    } else if score >= config.possibly_related_threshold {
336        DuplicateRisk::PossiblyRelated
337    } else if score >= config.maybe_related_threshold {
338        DuplicateRisk::MaybeRelated
339    } else {
340        DuplicateRisk::None
341    }
342}
343
/// Configuration for search/fusion thresholds.
///
/// Loaded from `.bones/config.toml` under the `[search]` section.
/// All threshold values are in [0, 1] range.
///
/// NOTE(review): the default thresholds (0.90/0.70/0.50) assume a score in
/// [0, 1], but raw `rrf_fuse` output with `rrf_k = 60` peaks near 0.049 —
/// confirm where normalization happens.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct SearchConfig {
    /// RRF constant; higher values reduce the impact of high ranks.
    #[serde(default = "default_rrf_k")]
    pub rrf_k: usize,

    /// Score threshold for `LikelyDuplicate` classification (default 0.90).
    #[serde(default = "default_likely_duplicate_threshold")]
    pub likely_duplicate_threshold: f32,

    /// Score threshold for `PossiblyRelated` classification (default 0.70).
    #[serde(default = "default_possibly_related_threshold")]
    pub possibly_related_threshold: f32,

    /// Score threshold for `MaybeRelated` classification (default 0.50).
    #[serde(default = "default_maybe_related_threshold")]
    pub maybe_related_threshold: f32,
}

367impl Default for SearchConfig {
368    fn default() -> Self {
369        Self {
370            rrf_k: default_rrf_k(),
371            likely_duplicate_threshold: default_likely_duplicate_threshold(),
372            possibly_related_threshold: default_possibly_related_threshold(),
373            maybe_related_threshold: default_maybe_related_threshold(),
374        }
375    }
376}
377
/// Default RRF `k` constant (60), shared by serde defaults and `Default`.
const fn default_rrf_k() -> usize {
    60
}

/// Default threshold for `DuplicateRisk::LikelyDuplicate`.
const fn default_likely_duplicate_threshold() -> f32 {
    0.90
}

/// Default threshold for `DuplicateRisk::PossiblyRelated`.
const fn default_possibly_related_threshold() -> f32 {
    0.70
}

/// Default threshold for `DuplicateRisk::MaybeRelated`.
const fn default_maybe_related_threshold() -> f32 {
    0.50
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;

    // -----------------------------------------------------------------------
    // DuplicateRisk
    // -----------------------------------------------------------------------

    #[test]
    fn duplicate_risk_eq() {
        assert_eq!(
            DuplicateRisk::LikelyDuplicate,
            DuplicateRisk::LikelyDuplicate
        );
        assert_ne!(
            DuplicateRisk::LikelyDuplicate,
            DuplicateRisk::PossiblyRelated
        );
    }

    // -----------------------------------------------------------------------
    // DupCandidate
    // -----------------------------------------------------------------------

    #[test]
    fn dup_candidate_fields() {
        let cand = DupCandidate {
            item_id: "bn-001".into(),
            composite_score: 0.85,
            lexical_rank: 1,
            semantic_rank: 2,
            structural_rank: usize::MAX,
            risk: DuplicateRisk::PossiblyRelated,
        };

        assert_eq!(cand.item_id, "bn-001");
        assert!((cand.composite_score - 0.85).abs() < 1e-6);
        assert_eq!(cand.lexical_rank, 1);
        assert_eq!(cand.semantic_rank, 2);
        assert_eq!(cand.structural_rank, usize::MAX);
        assert_eq!(cand.risk, DuplicateRisk::PossiblyRelated);
    }

    #[test]
    fn dup_candidate_clone_eq() {
        let cand = DupCandidate {
            item_id: "bn-001".into(),
            composite_score: 0.75,
            lexical_rank: 1,
            semantic_rank: usize::MAX,
            structural_rank: 3,
            risk: DuplicateRisk::MaybeRelated,
        };

        let cand2 = cand.clone();
        assert_eq!(cand, cand2);
    }

    // -----------------------------------------------------------------------
    // rrf_fuse
    // -----------------------------------------------------------------------

    #[test]
    fn rrf_fuse_empty_lists() {
        let result = rrf_fuse(&[], &[], &[], 60);
        assert!(result.is_empty());
    }

    #[test]
    fn rrf_fuse_single_item_all_lists() {
        let lex = vec!["bn-001"];
        let sem = vec!["bn-001"];
        let str = vec!["bn-001"];
        let result = rrf_fuse(&lex, &sem, &str, 60);

        assert_eq!(result.len(), 1);
        assert_eq!(result[0].0, "bn-001");

        // Score = 1/(60+1) + 1/(60+1) + 1/(60+1) = 3/61 ≈ 0.0492
        // (this is the maximum possible raw score with k = 60)
        let expected_score = 3.0 / 61.0;
        assert!((result[0].1 - expected_score).abs() < 1e-6);
    }

    #[test]
    fn rrf_fuse_different_lists() {
        // Lexical: [bn-001 (rank 1), bn-002 (rank 2)]
        // Semantic: [bn-002 (rank 1), bn-001 (rank 2)]
        // Structural: [bn-001 (rank 1)]
        let lex = vec!["bn-001", "bn-002"];
        let sem = vec!["bn-002", "bn-001"];
        let str = vec!["bn-001"];

        let result = rrf_fuse(&lex, &sem, &str, 60);

        // Both items should be present
        assert_eq!(result.len(), 2);

        // bn-001: 1/61 + 1/62 + 1/61 ≈ 0.0164 + 0.0161 + 0.0164 ≈ 0.0489
        let bn001_idx = result.iter().position(|(id, _)| id == "bn-001").unwrap();
        let bn001_score = result[bn001_idx].1;

        // bn-002: 1/62 + 1/61 ≈ 0.0161 + 0.0164 ≈ 0.0325
        let bn002_idx = result.iter().position(|(id, _)| id == "bn-002").unwrap();
        let bn002_score = result[bn002_idx].1;

        // bn-001 should have higher score (appears in all three lists)
        assert!(bn001_score > bn002_score);

        // Sorted descending by score
        assert!(result[0].1 >= result[1].1);
    }

    #[test]
    fn rrf_fuse_disjoint_lists() {
        let lex = vec!["bn-001"];
        let sem = vec!["bn-002"];
        let str = vec!["bn-003"];

        let result = rrf_fuse(&lex, &sem, &str, 60);

        assert_eq!(result.len(), 3);

        // All should have the same score (1/61 each — rank 1 in one list only)
        for (_, score) in &result {
            assert!((score - 1.0 / 61.0).abs() < 1e-6);
        }
    }

    #[test]
    fn rrf_fuse_stability_by_item_id() {
        // Two items with identical RRF scores should be sorted by item_id
        let lex = vec!["bn-002", "bn-001"];
        let sem = vec!["bn-001", "bn-002"];
        let str = vec![];

        let result = rrf_fuse(&lex, &sem, &str, 60);

        // Both have score 1/61 + 1/62, but bn-001 < bn-002 lexically
        assert_eq!(result[0].0, "bn-001");
        assert_eq!(result[1].0, "bn-002");
    }

    #[test]
    fn rrf_fuse_respects_k() {
        // Lower k increases the score impact of ranks
        let lex = vec!["bn-001"];
        let sem = vec![];
        let str = vec![];

        let k60 = rrf_fuse(&lex, &sem, &str, 60);
        let k10 = rrf_fuse(&lex, &sem, &str, 10);

        // With k=10: score = 1/11 ≈ 0.0909
        // With k=60: score = 1/61 ≈ 0.0164
        assert!(k10[0].1 > k60[0].1);
    }

    // -----------------------------------------------------------------------
    // classify_risk
    // -----------------------------------------------------------------------

    #[test]
    fn classify_risk_likely_duplicate() {
        let config = SearchConfig::default();

        assert_eq!(classify_risk(0.90, &config), DuplicateRisk::LikelyDuplicate);
        assert_eq!(classify_risk(0.95, &config), DuplicateRisk::LikelyDuplicate);
        assert_eq!(classify_risk(1.0, &config), DuplicateRisk::LikelyDuplicate);
    }

    #[test]
    fn classify_risk_possibly_related() {
        let config = SearchConfig::default();

        assert_eq!(classify_risk(0.70, &config), DuplicateRisk::PossiblyRelated);
        assert_eq!(classify_risk(0.80, &config), DuplicateRisk::PossiblyRelated);
        assert_eq!(classify_risk(0.89, &config), DuplicateRisk::PossiblyRelated);
    }

    #[test]
    fn classify_risk_maybe_related() {
        let config = SearchConfig::default();

        assert_eq!(classify_risk(0.50, &config), DuplicateRisk::MaybeRelated);
        assert_eq!(classify_risk(0.60, &config), DuplicateRisk::MaybeRelated);
        assert_eq!(classify_risk(0.69, &config), DuplicateRisk::MaybeRelated);
    }

    #[test]
    fn classify_risk_none() {
        let config = SearchConfig::default();

        assert_eq!(classify_risk(0.0, &config), DuplicateRisk::None);
        assert_eq!(classify_risk(0.25, &config), DuplicateRisk::None);
        assert_eq!(classify_risk(0.49, &config), DuplicateRisk::None);
    }

    #[test]
    fn classify_risk_boundary_values() {
        let config = SearchConfig::default();

        // Exactly at boundaries (thresholds are inclusive)
        assert_eq!(classify_risk(0.90, &config), DuplicateRisk::LikelyDuplicate);
        assert_eq!(classify_risk(0.70, &config), DuplicateRisk::PossiblyRelated);
        assert_eq!(classify_risk(0.50, &config), DuplicateRisk::MaybeRelated);

        // Just below boundaries
        assert_eq!(
            classify_risk(0.89999, &config),
            DuplicateRisk::PossiblyRelated
        );
        assert_eq!(classify_risk(0.69999, &config), DuplicateRisk::MaybeRelated);
        assert_eq!(classify_risk(0.49999, &config), DuplicateRisk::None);
    }

    #[test]
    fn classify_risk_custom_thresholds() {
        let config = SearchConfig {
            rrf_k: 60,
            likely_duplicate_threshold: 0.95,
            possibly_related_threshold: 0.75,
            maybe_related_threshold: 0.55,
        };

        assert_eq!(classify_risk(0.95, &config), DuplicateRisk::LikelyDuplicate);
        assert_eq!(classify_risk(0.85, &config), DuplicateRisk::PossiblyRelated);
        assert_eq!(classify_risk(0.65, &config), DuplicateRisk::MaybeRelated);
        assert_eq!(classify_risk(0.45, &config), DuplicateRisk::None);
    }

    // -----------------------------------------------------------------------
    // SearchConfig
    // -----------------------------------------------------------------------

    #[test]
    fn search_config_defaults() {
        let config = SearchConfig::default();
        assert_eq!(config.rrf_k, 60);
        assert!((config.likely_duplicate_threshold - 0.90).abs() < 1e-6);
        assert!((config.possibly_related_threshold - 0.70).abs() < 1e-6);
        assert!((config.maybe_related_threshold - 0.50).abs() < 1e-6);
    }

    #[test]
    fn search_config_clone_eq() {
        let config = SearchConfig::default();
        let config2 = config.clone();
        assert_eq!(config, config2);
    }

    // -----------------------------------------------------------------------
    // build_dup_candidates
    // -----------------------------------------------------------------------

    #[test]
    fn build_dup_candidates_empty_fused() {
        let config = SearchConfig::default();
        let candidates = build_dup_candidates(&[], &[], &[], &[], &config);
        assert!(candidates.is_empty());
    }

    #[test]
    fn build_dup_candidates_rank_metadata() {
        let config = SearchConfig::default();
        let lex = vec!["bn-001", "bn-002"];
        let sem = vec!["bn-002", "bn-001"];
        let str = vec!["bn-001"];

        let fused = rrf_fuse(&lex, &sem, &str, config.rrf_k);
        let candidates = build_dup_candidates(&fused, &lex, &sem, &str, &config);

        // bn-001 should be in position 0 with rank 1 in lex, rank 2 in sem, rank 1 in str
        let bn001_idx = candidates
            .iter()
            .position(|c| c.item_id == "bn-001")
            .unwrap();
        let bn001 = &candidates[bn001_idx];
        assert_eq!(bn001.lexical_rank, 1);
        assert_eq!(bn001.semantic_rank, 2);
        assert_eq!(bn001.structural_rank, 1);

        // bn-002 should have rank 2 in lex, rank 1 in sem, absent in str
        let bn002_idx = candidates
            .iter()
            .position(|c| c.item_id == "bn-002")
            .unwrap();
        let bn002 = &candidates[bn002_idx];
        assert_eq!(bn002.lexical_rank, 2);
        assert_eq!(bn002.semantic_rank, 1);
        assert_eq!(bn002.structural_rank, usize::MAX);
    }

    #[test]
    fn build_dup_candidates_missing_from_all_lists() {
        let config = SearchConfig::default();
        let lex = vec!["bn-001"];
        let sem = vec!["bn-001"];
        let str = vec!["bn-001"];

        // Create a fused list with a synthetic item not in any input list
        let fused = vec![("bn-001".to_string(), 0.85), ("bn-999".to_string(), 0.15)];
        let candidates = build_dup_candidates(&fused, &lex, &sem, &str, &config);

        assert_eq!(candidates.len(), 2);

        // bn-999 should have usize::MAX (the "absent" sentinel) for all ranks
        let bn999 = &candidates[1];
        assert_eq!(bn999.item_id, "bn-999");
        assert_eq!(bn999.lexical_rank, usize::MAX);
        assert_eq!(bn999.semantic_rank, usize::MAX);
        assert_eq!(bn999.structural_rank, usize::MAX);
    }

    #[test]
    fn build_dup_candidates_applies_risk_classification() {
        let config = SearchConfig::default();
        let lex = vec![];
        let sem = vec![];
        let str = vec![];

        // Synthetic scores chosen to land in each classification band
        let fused = vec![
            ("bn-likely".to_string(), 0.95),
            ("bn-possibly".to_string(), 0.75),
            ("bn-maybe".to_string(), 0.55),
            ("bn-none".to_string(), 0.25),
        ];

        let candidates = build_dup_candidates(&fused, &lex, &sem, &str, &config);

        assert_eq!(candidates[0].risk, DuplicateRisk::LikelyDuplicate);
        assert_eq!(candidates[1].risk, DuplicateRisk::PossiblyRelated);
        assert_eq!(candidates[2].risk, DuplicateRisk::MaybeRelated);
        assert_eq!(candidates[3].risk, DuplicateRisk::None);
    }

    #[test]
    fn build_dup_candidates_preserves_fused_order() {
        let config = SearchConfig::default();
        let lex = vec![];
        let sem = vec![];
        let str = vec![];

        let fused = vec![
            ("bn-a".to_string(), 0.9),
            ("bn-b".to_string(), 0.8),
            ("bn-c".to_string(), 0.7),
        ];

        let candidates = build_dup_candidates(&fused, &lex, &sem, &str, &config);

        assert_eq!(candidates[0].item_id, "bn-a");
        assert_eq!(candidates[1].item_id, "bn-b");
        assert_eq!(candidates[2].item_id, "bn-c");
    }
}