1use serde::{Deserialize, Serialize};
54use std::collections::BTreeMap;
55
/// Risk bucket assigned to a duplicate-detection candidate.
///
/// Serialized in `snake_case` (e.g. `likely_duplicate`). Buckets are chosen
/// by comparing a composite score against the thresholds in `SearchConfig`;
/// see `classify_risk`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum DuplicateRisk {
    /// Score at or above `SearchConfig::likely_duplicate_threshold`.
    LikelyDuplicate,

    /// Score at or above `SearchConfig::possibly_related_threshold`.
    PossiblyRelated,

    /// Score at or above `SearchConfig::maybe_related_threshold`.
    MaybeRelated,

    /// Score below every threshold.
    None,
}
84
/// A fused search result annotated with per-list rank metadata and a
/// risk classification. Produced by `build_dup_candidates`.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct DupCandidate {
    /// Identifier of the matched item.
    pub item_id: String,

    /// RRF-fused score across the lexical, semantic, and structural lists.
    pub composite_score: f32,

    /// 1-based rank in the lexical result list; `usize::MAX` when the item
    /// did not appear in that list.
    pub lexical_rank: usize,

    /// 1-based rank in the semantic result list; `usize::MAX` when absent.
    pub semantic_rank: usize,

    /// 1-based rank in the structural result list; `usize::MAX` when absent.
    pub structural_rank: usize,

    /// Risk bucket derived from `composite_score` via `classify_risk`.
    pub risk: DuplicateRisk,
}
117
/// Fuse three ranked result lists with Reciprocal Rank Fusion (RRF).
///
/// Each list contributes `1 / (k + rank)` per item, where `rank` is the
/// item's 1-based position within that list; contributions for the same
/// item id are summed across lists. Returns `(item_id, score)` pairs
/// sorted by descending score, with ties broken by ascending item id so
/// the ordering is deterministic.
#[must_use]
#[allow(clippy::cast_precision_loss)]
pub fn rrf_fuse(
    lexical: &[&str],
    semantic: &[&str],
    structural: &[&str],
    k: usize,
) -> Vec<(String, f32)> {
    let mut scores: BTreeMap<String, f32> = BTreeMap::new();

    // The three lists receive identical treatment, so fold them in a
    // single pass instead of three copy-pasted loops.
    for list in [lexical, semantic, structural] {
        for (idx, item_id) in list.iter().enumerate() {
            let rank = idx + 1;
            let contribution = 1.0 / (k as f32 + rank as f32);
            *scores.entry((*item_id).to_string()).or_insert(0.0) += contribution;
        }
    }

    let mut result: Vec<_> = scores.into_iter().collect();
    // Scores are finite sums of positive terms, so NaN cannot occur;
    // `total_cmp` provides a total order without the `unwrap_or(Equal)`
    // escape hatch that `partial_cmp` needed.
    result.sort_by(|a, b| b.1.total_cmp(&a.1).then_with(|| a.0.cmp(&b.0)));

    result
}
208
209#[must_use]
246pub fn build_dup_candidates(
247 fused: &[(String, f32)],
248 lexical: &[&str],
249 semantic: &[&str],
250 structural: &[&str],
251 config: &SearchConfig,
252) -> Vec<DupCandidate> {
253 let mut candidates = Vec::with_capacity(fused.len());
254
255 let lexical_map: std::collections::HashMap<&str, usize> = lexical
256 .iter()
257 .enumerate()
258 .map(|(i, &id)| (id, i + 1))
259 .collect();
260 let semantic_map: std::collections::HashMap<&str, usize> = semantic
261 .iter()
262 .enumerate()
263 .map(|(i, &id)| (id, i + 1))
264 .collect();
265 let structural_map: std::collections::HashMap<&str, usize> = structural
266 .iter()
267 .enumerate()
268 .map(|(i, &id)| (id, i + 1))
269 .collect();
270
271 for (item_id, composite_score) in fused {
272 let lexical_rank = lexical_map
273 .get(item_id.as_str())
274 .copied()
275 .unwrap_or(usize::MAX);
276 let semantic_rank = semantic_map
277 .get(item_id.as_str())
278 .copied()
279 .unwrap_or(usize::MAX);
280 let structural_rank = structural_map
281 .get(item_id.as_str())
282 .copied()
283 .unwrap_or(usize::MAX);
284
285 let risk = classify_risk(*composite_score, config);
286
287 candidates.push(DupCandidate {
288 item_id: item_id.clone(),
289 composite_score: *composite_score,
290 lexical_rank,
291 semantic_rank,
292 structural_rank,
293 risk,
294 });
295 }
296
297 candidates
298}
299
300#[must_use]
332pub fn classify_risk(score: f32, config: &SearchConfig) -> DuplicateRisk {
333 if score >= config.likely_duplicate_threshold {
334 DuplicateRisk::LikelyDuplicate
335 } else if score >= config.possibly_related_threshold {
336 DuplicateRisk::PossiblyRelated
337 } else if score >= config.maybe_related_threshold {
338 DuplicateRisk::MaybeRelated
339 } else {
340 DuplicateRisk::None
341 }
342}
343
/// Tunable parameters for duplicate-detection search fusion and risk
/// classification. Every field has a serde default, so a partially
/// specified (or empty) document deserializes to usable values.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct SearchConfig {
    /// RRF smoothing constant `k` used by `rrf_fuse` (default: 60).
    #[serde(default = "default_rrf_k")]
    pub rrf_k: usize,

    /// Minimum composite score for `DuplicateRisk::LikelyDuplicate`
    /// (default: 0.90, inclusive).
    #[serde(default = "default_likely_duplicate_threshold")]
    pub likely_duplicate_threshold: f32,

    /// Minimum composite score for `DuplicateRisk::PossiblyRelated`
    /// (default: 0.70, inclusive).
    #[serde(default = "default_possibly_related_threshold")]
    pub possibly_related_threshold: f32,

    /// Minimum composite score for `DuplicateRisk::MaybeRelated`
    /// (default: 0.50, inclusive).
    #[serde(default = "default_maybe_related_threshold")]
    pub maybe_related_threshold: f32,
}
366
impl Default for SearchConfig {
    /// Builds a config from the same `default_*` functions the serde
    /// attributes reference, so a config constructed in code matches one
    /// deserialized from an empty document.
    fn default() -> Self {
        Self {
            rrf_k: default_rrf_k(),
            likely_duplicate_threshold: default_likely_duplicate_threshold(),
            possibly_related_threshold: default_possibly_related_threshold(),
            maybe_related_threshold: default_maybe_related_threshold(),
        }
    }
}
377
/// Serde default for `SearchConfig::rrf_k`. 60 matches the constant used
/// in the original RRF literature — presumably chosen for that reason;
/// TODO confirm.
const fn default_rrf_k() -> usize {
    60
}

/// Serde default for `SearchConfig::likely_duplicate_threshold`.
const fn default_likely_duplicate_threshold() -> f32 {
    0.90
}

/// Serde default for `SearchConfig::possibly_related_threshold`.
const fn default_possibly_related_threshold() -> f32 {
    0.70
}

/// Serde default for `SearchConfig::maybe_related_threshold`.
const fn default_maybe_related_threshold() -> f32 {
    0.50
}
393
#[cfg(test)]
mod tests {
    //! Unit tests covering RRF fusion, candidate construction, risk
    //! classification (including threshold boundaries), and config defaults.

    use super::*;

    // DuplicateRisk derives PartialEq/Eq; sanity-check variant comparison.
    #[test]
    fn duplicate_risk_eq() {
        assert_eq!(
            DuplicateRisk::LikelyDuplicate,
            DuplicateRisk::LikelyDuplicate
        );
        assert_ne!(
            DuplicateRisk::LikelyDuplicate,
            DuplicateRisk::PossiblyRelated
        );
    }

    // Every field round-trips, including the usize::MAX "absent" sentinel.
    #[test]
    fn dup_candidate_fields() {
        let cand = DupCandidate {
            item_id: "bn-001".into(),
            composite_score: 0.85,
            lexical_rank: 1,
            semantic_rank: 2,
            structural_rank: usize::MAX,
            risk: DuplicateRisk::PossiblyRelated,
        };

        assert_eq!(cand.item_id, "bn-001");
        assert!((cand.composite_score - 0.85).abs() < 1e-6);
        assert_eq!(cand.lexical_rank, 1);
        assert_eq!(cand.semantic_rank, 2);
        assert_eq!(cand.structural_rank, usize::MAX);
        assert_eq!(cand.risk, DuplicateRisk::PossiblyRelated);
    }

    #[test]
    fn dup_candidate_clone_eq() {
        let cand = DupCandidate {
            item_id: "bn-001".into(),
            composite_score: 0.75,
            lexical_rank: 1,
            semantic_rank: usize::MAX,
            structural_rank: 3,
            risk: DuplicateRisk::MaybeRelated,
        };

        let cand2 = cand.clone();
        assert_eq!(cand, cand2);
    }

    #[test]
    fn rrf_fuse_empty_lists() {
        let result = rrf_fuse(&[], &[], &[], 60);
        assert!(result.is_empty());
    }

    // Rank 1 in all three lists scores 3 * 1/(k + 1).
    #[test]
    fn rrf_fuse_single_item_all_lists() {
        let lex = vec!["bn-001"];
        let sem = vec!["bn-001"];
        let str = vec!["bn-001"];
        let result = rrf_fuse(&lex, &sem, &str, 60);

        assert_eq!(result.len(), 1);
        assert_eq!(result[0].0, "bn-001");

        let expected_score = 3.0 / 61.0;
        assert!((result[0].1 - expected_score).abs() < 1e-6);
    }

    // bn-001 appears in all three lists, bn-002 in only two, so bn-001
    // must score higher; output must be sorted by descending score.
    #[test]
    fn rrf_fuse_different_lists() {
        let lex = vec!["bn-001", "bn-002"];
        let sem = vec!["bn-002", "bn-001"];
        let str = vec!["bn-001"];

        let result = rrf_fuse(&lex, &sem, &str, 60);

        assert_eq!(result.len(), 2);

        let bn001_idx = result.iter().position(|(id, _)| id == "bn-001").unwrap();
        let bn001_score = result[bn001_idx].1;

        let bn002_idx = result.iter().position(|(id, _)| id == "bn-002").unwrap();
        let bn002_score = result[bn002_idx].1;

        assert!(bn001_score > bn002_score);

        assert!(result[0].1 >= result[1].1);
    }

    // Items unique to one list each get exactly one 1/(k + 1) contribution.
    #[test]
    fn rrf_fuse_disjoint_lists() {
        let lex = vec!["bn-001"];
        let sem = vec!["bn-002"];
        let str = vec!["bn-003"];

        let result = rrf_fuse(&lex, &sem, &str, 60);

        assert_eq!(result.len(), 3);

        for (_, score) in &result {
            assert!((score - 1.0 / 61.0).abs() < 1e-6);
        }
    }

    // Both items have identical totals (1/61 + 1/62); the tie must break
    // by ascending item id so results are deterministic.
    #[test]
    fn rrf_fuse_stability_by_item_id() {
        let lex = vec!["bn-002", "bn-001"];
        let sem = vec!["bn-001", "bn-002"];
        let str = vec![];

        let result = rrf_fuse(&lex, &sem, &str, 60);

        assert_eq!(result[0].0, "bn-001");
        assert_eq!(result[1].0, "bn-002");
    }

    // Smaller k inflates every contribution, so the same item scores higher.
    #[test]
    fn rrf_fuse_respects_k() {
        let lex = vec!["bn-001"];
        let sem = vec![];
        let str = vec![];

        let k60 = rrf_fuse(&lex, &sem, &str, 60);
        let k10 = rrf_fuse(&lex, &sem, &str, 10);

        assert!(k10[0].1 > k60[0].1);
    }

    #[test]
    fn classify_risk_likely_duplicate() {
        let config = SearchConfig::default();

        assert_eq!(classify_risk(0.90, &config), DuplicateRisk::LikelyDuplicate);
        assert_eq!(classify_risk(0.95, &config), DuplicateRisk::LikelyDuplicate);
        assert_eq!(classify_risk(1.0, &config), DuplicateRisk::LikelyDuplicate);
    }

    #[test]
    fn classify_risk_possibly_related() {
        let config = SearchConfig::default();

        assert_eq!(classify_risk(0.70, &config), DuplicateRisk::PossiblyRelated);
        assert_eq!(classify_risk(0.80, &config), DuplicateRisk::PossiblyRelated);
        assert_eq!(classify_risk(0.89, &config), DuplicateRisk::PossiblyRelated);
    }

    #[test]
    fn classify_risk_maybe_related() {
        let config = SearchConfig::default();

        assert_eq!(classify_risk(0.50, &config), DuplicateRisk::MaybeRelated);
        assert_eq!(classify_risk(0.60, &config), DuplicateRisk::MaybeRelated);
        assert_eq!(classify_risk(0.69, &config), DuplicateRisk::MaybeRelated);
    }

    #[test]
    fn classify_risk_none() {
        let config = SearchConfig::default();

        assert_eq!(classify_risk(0.0, &config), DuplicateRisk::None);
        assert_eq!(classify_risk(0.25, &config), DuplicateRisk::None);
        assert_eq!(classify_risk(0.49, &config), DuplicateRisk::None);
    }

    // Thresholds are inclusive: exact threshold values land in the higher
    // bucket; values just below fall through to the next one.
    #[test]
    fn classify_risk_boundary_values() {
        let config = SearchConfig::default();

        assert_eq!(classify_risk(0.90, &config), DuplicateRisk::LikelyDuplicate);
        assert_eq!(classify_risk(0.70, &config), DuplicateRisk::PossiblyRelated);
        assert_eq!(classify_risk(0.50, &config), DuplicateRisk::MaybeRelated);

        assert_eq!(
            classify_risk(0.89999, &config),
            DuplicateRisk::PossiblyRelated
        );
        assert_eq!(classify_risk(0.69999, &config), DuplicateRisk::MaybeRelated);
        assert_eq!(classify_risk(0.49999, &config), DuplicateRisk::None);
    }

    #[test]
    fn classify_risk_custom_thresholds() {
        let config = SearchConfig {
            rrf_k: 60,
            likely_duplicate_threshold: 0.95,
            possibly_related_threshold: 0.75,
            maybe_related_threshold: 0.55,
        };

        assert_eq!(classify_risk(0.95, &config), DuplicateRisk::LikelyDuplicate);
        assert_eq!(classify_risk(0.85, &config), DuplicateRisk::PossiblyRelated);
        assert_eq!(classify_risk(0.65, &config), DuplicateRisk::MaybeRelated);
        assert_eq!(classify_risk(0.45, &config), DuplicateRisk::None);
    }

    #[test]
    fn search_config_defaults() {
        let config = SearchConfig::default();
        assert_eq!(config.rrf_k, 60);
        assert!((config.likely_duplicate_threshold - 0.90).abs() < 1e-6);
        assert!((config.possibly_related_threshold - 0.70).abs() < 1e-6);
        assert!((config.maybe_related_threshold - 0.50).abs() < 1e-6);
    }

    #[test]
    fn search_config_clone_eq() {
        let config = SearchConfig::default();
        let config2 = config.clone();
        assert_eq!(config, config2);
    }

    #[test]
    fn build_dup_candidates_empty_fused() {
        let config = SearchConfig::default();
        let candidates = build_dup_candidates(&[], &[], &[], &[], &config);
        assert!(candidates.is_empty());
    }

    // Ranks are 1-based per list; absence is encoded as usize::MAX.
    #[test]
    fn build_dup_candidates_rank_metadata() {
        let config = SearchConfig::default();
        let lex = vec!["bn-001", "bn-002"];
        let sem = vec!["bn-002", "bn-001"];
        let str = vec!["bn-001"];

        let fused = rrf_fuse(&lex, &sem, &str, config.rrf_k);
        let candidates = build_dup_candidates(&fused, &lex, &sem, &str, &config);

        let bn001_idx = candidates
            .iter()
            .position(|c| c.item_id == "bn-001")
            .unwrap();
        let bn001 = &candidates[bn001_idx];
        assert_eq!(bn001.lexical_rank, 1);
        assert_eq!(bn001.semantic_rank, 2);
        assert_eq!(bn001.structural_rank, 1);

        let bn002_idx = candidates
            .iter()
            .position(|c| c.item_id == "bn-002")
            .unwrap();
        let bn002 = &candidates[bn002_idx];
        assert_eq!(bn002.lexical_rank, 2);
        assert_eq!(bn002.semantic_rank, 1);
        assert_eq!(bn002.structural_rank, usize::MAX);
    }

    // A fused item not present in any source list gets all-MAX ranks.
    #[test]
    fn build_dup_candidates_missing_from_all_lists() {
        let config = SearchConfig::default();
        let lex = vec!["bn-001"];
        let sem = vec!["bn-001"];
        let str = vec!["bn-001"];

        let fused = vec![("bn-001".to_string(), 0.85), ("bn-999".to_string(), 0.15)];
        let candidates = build_dup_candidates(&fused, &lex, &sem, &str, &config);

        assert_eq!(candidates.len(), 2);

        let bn999 = &candidates[1];
        assert_eq!(bn999.item_id, "bn-999");
        assert_eq!(bn999.lexical_rank, usize::MAX);
        assert_eq!(bn999.semantic_rank, usize::MAX);
        assert_eq!(bn999.structural_rank, usize::MAX);
    }

    // One score per default-threshold bucket, checked in fused order.
    #[test]
    fn build_dup_candidates_applies_risk_classification() {
        let config = SearchConfig::default();
        let lex = vec![];
        let sem = vec![];
        let str = vec![];

        let fused = vec![
            ("bn-likely".to_string(), 0.95),
            ("bn-possibly".to_string(), 0.75),
            ("bn-maybe".to_string(), 0.55),
            ("bn-none".to_string(), 0.25),
        ];

        let candidates = build_dup_candidates(&fused, &lex, &sem, &str, &config);

        assert_eq!(candidates[0].risk, DuplicateRisk::LikelyDuplicate);
        assert_eq!(candidates[1].risk, DuplicateRisk::PossiblyRelated);
        assert_eq!(candidates[2].risk, DuplicateRisk::MaybeRelated);
        assert_eq!(candidates[3].risk, DuplicateRisk::None);
    }

    // Output order must mirror the fused input order.
    #[test]
    fn build_dup_candidates_preserves_fused_order() {
        let config = SearchConfig::default();
        let lex = vec![];
        let sem = vec![];
        let str = vec![];

        let fused = vec![
            ("bn-a".to_string(), 0.9),
            ("bn-b".to_string(), 0.8),
            ("bn-c".to_string(), 0.7),
        ];

        let candidates = build_dup_candidates(&fused, &lex, &sem, &str, &config);

        assert_eq!(candidates[0].item_id, "bn-a");
        assert_eq!(candidates[1].item_id, "bn-b");
        assert_eq!(candidates[2].item_id, "bn-c");
    }
}