Skip to main content

ref_solver/catalog/
index.rs

1use std::collections::{HashMap, HashSet};
2
3use crate::core::header::QueryHeader;
4
5use super::store::ReferenceCatalog;
6
7/// Finds candidate references that might match a query header
8pub struct CandidateFinder<'a> {
9    catalog: &'a ReferenceCatalog,
10}
11
12impl<'a> CandidateFinder<'a> {
13    #[must_use]
14    pub fn new(catalog: &'a ReferenceCatalog) -> Self {
15        Self { catalog }
16    }
17
18    /// Find candidate references based on MD5 overlap
19    /// Returns indices sorted by number of overlapping MD5s (descending)
20    #[must_use]
21    pub fn find_candidates_by_md5(&self, query: &QueryHeader) -> Vec<(usize, usize)> {
22        let mut ref_counts: HashMap<usize, usize> = HashMap::new();
23
24        for md5 in &query.md5_set {
25            if let Some(indices) = self.catalog.md5_to_refs.get(md5) {
26                for &idx in indices {
27                    *ref_counts.entry(idx).or_default() += 1;
28                }
29            }
30        }
31
32        let mut candidates: Vec<_> = ref_counts.into_iter().collect();
33        candidates.sort_by(|a, b| b.1.cmp(&a.1)); // Sort by count descending
34        candidates
35    }
36
37    /// Find candidates by (name, length) pairs when MD5s aren't available
38    #[must_use]
39    pub fn find_candidates_by_name_length(&self, query: &QueryHeader) -> Vec<(usize, usize)> {
40        let mut ref_counts: HashMap<usize, usize> = HashMap::new();
41
42        // Check primary names
43        for (name, length) in &query.name_length_set {
44            if let Some(indices) = self
45                .catalog
46                .name_length_to_refs
47                .get(&(name.clone(), *length))
48            {
49                for &idx in indices {
50                    *ref_counts.entry(idx).or_default() += 1;
51                }
52            }
53        }
54
55        // Also check query aliases against catalog names (reverse alias matching)
56        // This handles cases where query has aliases like NC_000001.11 that match
57        // catalog names directly
58        for (alias, length) in &query.alias_length_set {
59            if let Some(indices) = self
60                .catalog
61                .name_length_to_refs
62                .get(&(alias.clone(), *length))
63            {
64                for &idx in indices {
65                    *ref_counts.entry(idx).or_default() += 1;
66                }
67            }
68        }
69
70        let mut candidates: Vec<_> = ref_counts.into_iter().collect();
71        candidates.sort_by(|a, b| b.1.cmp(&a.1));
72        candidates
73    }
74
75    /// Get top N candidates combining MD5 and name/length matching
76    #[must_use]
77    pub fn find_top_candidates(&self, query: &QueryHeader, limit: usize) -> Vec<usize> {
78        let mut seen: HashSet<usize> = HashSet::new();
79        let mut result = Vec::new();
80
81        // First add MD5-based candidates (higher priority)
82        for (idx, _) in self.find_candidates_by_md5(query) {
83            if seen.insert(idx) {
84                result.push(idx);
85                if result.len() >= limit {
86                    return result;
87                }
88            }
89        }
90
91        // Then add name/length candidates
92        for (idx, _) in self.find_candidates_by_name_length(query) {
93            if seen.insert(idx) {
94                result.push(idx);
95                if result.len() >= limit {
96                    return result;
97                }
98            }
99        }
100
101        result
102    }
103}
104
105#[cfg(test)]
106mod tests {
107    use super::*;
108    use crate::core::contig::Contig;
109    use crate::core::reference::KnownReference;
110    use crate::core::types::{Assembly, ReferenceSource};
111
112    #[test]
113    fn test_find_candidates_via_alias() {
114        // Create a catalog with an NCBI reference that has UCSC aliases
115        let mut catalog = ReferenceCatalog::new();
116
117        let ref_contigs = vec![
118            Contig::new("NC_000001.11", 248_956_422)
119                .with_md5("6aef897c3d6ff0c78aff06ac189178dd")
120                .with_aliases(vec!["chr1".to_string(), "1".to_string()]),
121            Contig::new("NC_000002.12", 242_193_529)
122                .with_md5("f98db672eb0993dcfdabafe2a882905c")
123                .with_aliases(vec!["chr2".to_string(), "2".to_string()]),
124        ];
125
126        let reference = KnownReference::new(
127            "test_ncbi_ref",
128            "Test NCBI Reference",
129            Assembly::Grch38,
130            ReferenceSource::Custom("test".to_string()),
131        )
132        .with_contigs(ref_contigs);
133
134        catalog.add_reference(reference);
135
136        // Create a query with UCSC names (no aliases, no MD5s)
137        let query = QueryHeader::new(vec![
138            Contig::new("chr1", 248_956_422),
139            Contig::new("chr2", 242_193_529),
140        ]);
141
142        // The CandidateFinder should find the NCBI reference via alias matching
143        let finder = CandidateFinder::new(&catalog);
144        let candidates = finder.find_candidates_by_name_length(&query);
145
146        assert!(
147            !candidates.is_empty(),
148            "Should find NCBI reference as candidate via UCSC aliases"
149        );
150
151        // The reference should have matched 2 contigs via alias
152        let (idx, count) = candidates[0];
153        assert_eq!(idx, 0, "Should match the first (and only) reference");
154        assert_eq!(count, 2, "Should match both chr1 and chr2 via aliases");
155    }
156
157    #[test]
158    fn test_find_candidates_real_catalog() {
159        // Test with the real embedded catalog
160        let catalog = ReferenceCatalog::load_embedded().unwrap();
161
162        // Create a query with UCSC names (matching hg38)
163        let query = QueryHeader::new(vec![
164            Contig::new("chr1", 248_956_422),
165            Contig::new("chr2", 242_193_529),
166        ]);
167
168        let finder = CandidateFinder::new(&catalog);
169        let candidates = finder.find_candidates_by_name_length(&query);
170
171        // Should find some candidates
172        assert!(
173            !candidates.is_empty(),
174            "Should find candidates for UCSC chr1/chr2 query"
175        );
176
177        // Check if grch38_v38 (p12) is among the candidates
178        let grch38_v38_idx = catalog
179            .references
180            .iter()
181            .position(|r| r.id.0 == "grch38_v38");
182
183        if let Some(expected_idx) = grch38_v38_idx {
184            let found = candidates.iter().any(|(idx, _)| *idx == expected_idx);
185            assert!(
186                found,
187                "grch38_v38 (with UCSC aliases) should be found as candidate for UCSC query"
188            );
189        }
190    }
191}