ref_solver/catalog/
index.rs1use std::collections::{HashMap, HashSet};
2
3use crate::core::header::QueryHeader;
4
5use super::store::ReferenceCatalog;
6
7pub struct CandidateFinder<'a> {
9 catalog: &'a ReferenceCatalog,
10}
11
12impl<'a> CandidateFinder<'a> {
13 #[must_use]
14 pub fn new(catalog: &'a ReferenceCatalog) -> Self {
15 Self { catalog }
16 }
17
18 #[must_use]
21 pub fn find_candidates_by_md5(&self, query: &QueryHeader) -> Vec<(usize, usize)> {
22 let mut ref_counts: HashMap<usize, usize> = HashMap::new();
23
24 for md5 in &query.md5_set {
25 if let Some(indices) = self.catalog.md5_to_refs.get(md5) {
26 for &idx in indices {
27 *ref_counts.entry(idx).or_default() += 1;
28 }
29 }
30 }
31
32 let mut candidates: Vec<_> = ref_counts.into_iter().collect();
33 candidates.sort_by(|a, b| b.1.cmp(&a.1)); candidates
35 }
36
37 #[must_use]
39 pub fn find_candidates_by_name_length(&self, query: &QueryHeader) -> Vec<(usize, usize)> {
40 let mut ref_counts: HashMap<usize, usize> = HashMap::new();
41
42 for (name, length) in &query.name_length_set {
44 if let Some(indices) = self
45 .catalog
46 .name_length_to_refs
47 .get(&(name.clone(), *length))
48 {
49 for &idx in indices {
50 *ref_counts.entry(idx).or_default() += 1;
51 }
52 }
53 }
54
55 for (alias, length) in &query.alias_length_set {
59 if let Some(indices) = self
60 .catalog
61 .name_length_to_refs
62 .get(&(alias.clone(), *length))
63 {
64 for &idx in indices {
65 *ref_counts.entry(idx).or_default() += 1;
66 }
67 }
68 }
69
70 let mut candidates: Vec<_> = ref_counts.into_iter().collect();
71 candidates.sort_by(|a, b| b.1.cmp(&a.1));
72 candidates
73 }
74
75 #[must_use]
77 pub fn find_top_candidates(&self, query: &QueryHeader, limit: usize) -> Vec<usize> {
78 let mut seen: HashSet<usize> = HashSet::new();
79 let mut result = Vec::new();
80
81 for (idx, _) in self.find_candidates_by_md5(query) {
83 if seen.insert(idx) {
84 result.push(idx);
85 if result.len() >= limit {
86 return result;
87 }
88 }
89 }
90
91 for (idx, _) in self.find_candidates_by_name_length(query) {
93 if seen.insert(idx) {
94 result.push(idx);
95 if result.len() >= limit {
96 return result;
97 }
98 }
99 }
100
101 result
102 }
103}
104
105#[cfg(test)]
106mod tests {
107 use super::*;
108 use crate::core::contig::Contig;
109 use crate::core::reference::KnownReference;
110 use crate::core::types::{Assembly, ReferenceSource};
111
112 #[test]
113 fn test_find_candidates_via_alias() {
114 let mut catalog = ReferenceCatalog::new();
116
117 let ref_contigs = vec![
118 Contig::new("NC_000001.11", 248_956_422)
119 .with_md5("6aef897c3d6ff0c78aff06ac189178dd")
120 .with_aliases(vec!["chr1".to_string(), "1".to_string()]),
121 Contig::new("NC_000002.12", 242_193_529)
122 .with_md5("f98db672eb0993dcfdabafe2a882905c")
123 .with_aliases(vec!["chr2".to_string(), "2".to_string()]),
124 ];
125
126 let reference = KnownReference::new(
127 "test_ncbi_ref",
128 "Test NCBI Reference",
129 Assembly::Grch38,
130 ReferenceSource::Custom("test".to_string()),
131 )
132 .with_contigs(ref_contigs);
133
134 catalog.add_reference(reference);
135
136 let query = QueryHeader::new(vec![
138 Contig::new("chr1", 248_956_422),
139 Contig::new("chr2", 242_193_529),
140 ]);
141
142 let finder = CandidateFinder::new(&catalog);
144 let candidates = finder.find_candidates_by_name_length(&query);
145
146 assert!(
147 !candidates.is_empty(),
148 "Should find NCBI reference as candidate via UCSC aliases"
149 );
150
151 let (idx, count) = candidates[0];
153 assert_eq!(idx, 0, "Should match the first (and only) reference");
154 assert_eq!(count, 2, "Should match both chr1 and chr2 via aliases");
155 }
156
157 #[test]
158 fn test_find_candidates_real_catalog() {
159 let catalog = ReferenceCatalog::load_embedded().unwrap();
161
162 let query = QueryHeader::new(vec![
164 Contig::new("chr1", 248_956_422),
165 Contig::new("chr2", 242_193_529),
166 ]);
167
168 let finder = CandidateFinder::new(&catalog);
169 let candidates = finder.find_candidates_by_name_length(&query);
170
171 assert!(
173 !candidates.is_empty(),
174 "Should find candidates for UCSC chr1/chr2 query"
175 );
176
177 let grch38_v38_idx = catalog
179 .references
180 .iter()
181 .position(|r| r.id.0 == "grch38_v38");
182
183 if let Some(expected_idx) = grch38_v38_idx {
184 let found = candidates.iter().any(|(idx, _)| *idx == expected_idx);
185 assert!(
186 found,
187 "grch38_v38 (with UCSC aliases) should be found as candidate for UCSC query"
188 );
189 }
190 }
191}