1use std::collections::HashMap;
7
8use crate::catalog::hierarchical::{CatalogIndex, HierarchicalCatalog};
9use crate::core::assembly::PresenceCounts;
10use crate::core::header::QueryHeader;
11use crate::core::types::MatchType;
12
13#[inline]
15fn count_to_f64(count: usize) -> f64 {
16 #[allow(clippy::cast_precision_loss)]
17 {
18 count as f64
19 }
20}
21
22#[derive(Debug, Clone)]
24pub struct HierarchicalMatchResult {
25 pub distribution_id: String,
27 pub display_name: String,
29 pub assembly_id: String,
31 pub assembly_name: String,
33 pub version_id: String,
35 pub version_string: String,
37 pub match_type: MatchType,
39 pub score: f64,
41 pub matched_contigs: usize,
43 pub total_query_contigs: usize,
45 pub total_distribution_contigs: usize,
47 pub presence_counts: PresenceCounts,
49 pub extra_in_query: usize,
51 pub missing_from_query: usize,
53}
54
55impl HierarchicalMatchResult {
56 #[must_use]
58 pub fn match_percentage(&self) -> f64 {
59 self.score * 100.0
60 }
61
62 #[cfg(test)]
64 #[must_use]
65 pub fn is_exact(&self) -> bool {
66 matches!(self.match_type, MatchType::Exact)
67 }
68}
69
70pub struct HierarchicalMatchingEngine<'a> {
72 catalog: &'a HierarchicalCatalog,
73 index: CatalogIndex,
74 min_score: f64,
75}
76
77impl<'a> HierarchicalMatchingEngine<'a> {
78 #[must_use]
80 pub fn new(catalog: &'a HierarchicalCatalog) -> Self {
81 let index = catalog.build_index();
82 Self {
83 catalog,
84 index,
85 min_score: 0.1,
86 }
87 }
88
89 #[must_use]
91 pub fn find_matches(&self, query: &QueryHeader, limit: usize) -> Vec<HierarchicalMatchResult> {
92 let mut scores: HashMap<String, (usize, usize)> = HashMap::new(); for contig in &query.contigs {
97 if let Some(md5) = &contig.md5 {
99 for location in self.index.find_by_md5(md5) {
100 let entry = scores
101 .entry(location.distribution_id.clone())
102 .or_insert((0, 0));
103 entry.0 += 1;
104 }
105 }
106
107 for location in self.index.find_by_name_length(&contig.name, contig.length) {
109 let entry = scores
110 .entry(location.distribution_id.clone())
111 .or_insert((0, 0));
112 entry.1 += 1;
113 }
114 }
115
116 let mut results: Vec<HierarchicalMatchResult> = scores
118 .into_iter()
119 .filter_map(|(dist_id, (md5_matches, name_len_matches))| {
120 self.build_result(&dist_id, query, md5_matches, name_len_matches)
121 })
122 .filter(|r| r.score >= self.min_score)
123 .collect();
124
125 results.sort_by(|a, b| {
127 b.score
128 .partial_cmp(&a.score)
129 .unwrap_or(std::cmp::Ordering::Equal)
130 });
131
132 results.truncate(limit);
133 results
134 }
135
136 fn build_result(
138 &self,
139 dist_id: &str,
140 query: &QueryHeader,
141 md5_matches: usize,
142 name_len_matches: usize,
143 ) -> Option<HierarchicalMatchResult> {
144 let dist_ref = self.catalog.get_distribution(dist_id)?;
145 let dist = dist_ref.distribution;
146
147 let matched = md5_matches.max(name_len_matches);
149
150 let query_size = query.contigs.len();
152 let dist_size = dist.contigs.len();
153 let union = query_size + dist_size - matched;
154 let score = if union > 0 {
155 count_to_f64(matched) / count_to_f64(union)
156 } else {
157 0.0
158 };
159
160 let match_type = if matched == query_size && matched == dist_size {
162 MatchType::Exact
163 } else if matched > 0 {
164 MatchType::Partial
165 } else {
166 MatchType::NoMatch
167 };
168
169 let presence_counts = dist.presence_counts();
170
171 Some(HierarchicalMatchResult {
172 distribution_id: dist_id.to_string(),
173 display_name: dist.display_name.clone(),
174 assembly_id: dist_ref.assembly.map(|a| a.id.clone()).unwrap_or_default(),
175 assembly_name: dist_ref
176 .assembly
177 .map(|a| a.name.clone())
178 .unwrap_or_default(),
179 version_id: dist_ref.version.map(|v| v.id.clone()).unwrap_or_default(),
180 version_string: dist_ref
181 .version
182 .map(|v| v.version.clone())
183 .unwrap_or_default(),
184 match_type,
185 score,
186 matched_contigs: matched,
187 total_query_contigs: query_size,
188 total_distribution_contigs: dist_size,
189 presence_counts,
190 extra_in_query: query_size.saturating_sub(matched),
191 missing_from_query: dist_size.saturating_sub(matched),
192 })
193 }
194}
195
196#[cfg(test)]
197mod tests {
198 use super::*;
199 use crate::core::assembly::{
200 AssemblyVersion, FastaContig, FastaDistribution, HierarchicalAssembly, ReportSource,
201 };
202 use crate::core::contig::Contig;
203 use crate::core::types::ReferenceSource;
204
205 fn make_test_catalog() -> HierarchicalCatalog {
206 HierarchicalCatalog {
207 version: "1.0.0".to_string(),
208 created_at: "2024-01-01T00:00:00Z".to_string(),
209 assemblies: vec![HierarchicalAssembly {
210 id: "grch38".to_string(),
211 name: "GRCh38".to_string(),
212 organism: "Homo sapiens".to_string(),
213 versions: vec![AssemblyVersion {
214 id: "grch38_p14".to_string(),
215 version: "p14".to_string(),
216 source: ReportSource::Ncbi {
217 accession: "GCF_000001405.40".to_string(),
218 url: None,
219 date: None,
220 },
221 report_contigs: vec![],
222 fasta_distributions: vec![
223 FastaDistribution {
224 id: "hg38_ucsc".to_string(),
225 display_name: "hg38 (UCSC)".to_string(),
226 source: ReferenceSource::Ucsc,
227 download_url: None,
228 tags: vec![],
229 contigs: vec![
230 FastaContig::new(
231 "chr1",
232 248_956_422,
233 "6aef897c3d6ff0c78aff06ac189178dd",
234 ),
235 FastaContig::new(
236 "chr2",
237 242_193_529,
238 "f98db672eb0993dcfdabafe2a882905c",
239 ),
240 ],
241 },
242 FastaDistribution {
243 id: "hs38DH".to_string(),
244 display_name: "hs38DH (1KG)".to_string(),
245 source: ReferenceSource::OneThousandGenomes,
246 download_url: None,
247 tags: vec![],
248 contigs: vec![
249 FastaContig::new(
250 "chr1",
251 248_956_422,
252 "6aef897c3d6ff0c78aff06ac189178dd",
253 ),
254 FastaContig::new(
255 "chr2",
256 242_193_529,
257 "f98db672eb0993dcfdabafe2a882905c",
258 ),
259 FastaContig::new(
260 "chr1_decoy",
261 5000,
262 "decoy_md5_hash_here_12345678",
263 ),
264 ],
265 },
266 ],
267 }],
268 }],
269 standalone_distributions: vec![],
270 }
271 }
272
273 #[test]
274 fn test_exact_match() {
275 let catalog = make_test_catalog();
276 let engine = HierarchicalMatchingEngine::new(&catalog);
277
278 let query = QueryHeader::new(vec![
280 Contig::new("chr1", 248_956_422).with_md5("6aef897c3d6ff0c78aff06ac189178dd"),
281 Contig::new("chr2", 242_193_529).with_md5("f98db672eb0993dcfdabafe2a882905c"),
282 ]);
283
284 let matches = engine.find_matches(&query, 5);
285 assert!(!matches.is_empty());
286
287 let hg38 = matches.iter().find(|m| m.distribution_id == "hg38_ucsc");
289 assert!(hg38.is_some());
290 let hg38 = hg38.unwrap();
291 assert!(hg38.is_exact());
292 assert_eq!(hg38.assembly_name, "GRCh38");
293 }
294
295 #[test]
296 fn test_subset_match() {
297 let catalog = make_test_catalog();
298 let engine = HierarchicalMatchingEngine::new(&catalog);
299
300 let query = QueryHeader::new(vec![
302 Contig::new("chr1", 248_956_422).with_md5("6aef897c3d6ff0c78aff06ac189178dd")
303 ]);
304
305 let matches = engine.find_matches(&query, 5);
306 assert!(!matches.is_empty());
307
308 let best = &matches[0];
310 assert!(matches!(best.match_type, MatchType::Partial));
311 assert_eq!(best.matched_contigs, 1);
312 assert!(best.missing_from_query > 0);
313 }
314
315 #[test]
316 fn test_superset_match() {
317 let catalog = make_test_catalog();
318 let engine = HierarchicalMatchingEngine::new(&catalog);
319
320 let query = QueryHeader::new(vec![
322 Contig::new("chr1", 248_956_422).with_md5("6aef897c3d6ff0c78aff06ac189178dd"),
323 Contig::new("chr2", 242_193_529).with_md5("f98db672eb0993dcfdabafe2a882905c"),
324 Contig::new("chr1_decoy", 5000).with_md5("decoy_md5_hash_here_12345678"),
325 ]);
326
327 let matches = engine.find_matches(&query, 5);
328 assert!(!matches.is_empty());
329
330 let hs38dh = matches.iter().find(|m| m.distribution_id == "hs38DH");
332 assert!(hs38dh.is_some());
333 assert!(hs38dh.unwrap().is_exact());
334
335 let hg38 = matches.iter().find(|m| m.distribution_id == "hg38_ucsc");
337 assert!(hg38.is_some());
338 assert!(matches!(hg38.unwrap().match_type, MatchType::Partial));
339 assert!(hg38.unwrap().extra_in_query > 0);
340 }
341
342 #[test]
343 fn test_name_length_fallback() {
344 let catalog = make_test_catalog();
345 let engine = HierarchicalMatchingEngine::new(&catalog);
346
347 let query = QueryHeader::new(vec![
349 Contig::new("chr1", 248_956_422),
350 Contig::new("chr2", 242_193_529),
351 ]);
352
353 let matches = engine.find_matches(&query, 5);
354 assert!(!matches.is_empty());
355
356 let best = &matches[0];
358 assert!(best.score > 0.5);
359 }
360
361 #[test]
362 fn test_assembly_info_populated() {
363 let catalog = make_test_catalog();
364 let engine = HierarchicalMatchingEngine::new(&catalog);
365
366 let query = QueryHeader::new(vec![
367 Contig::new("chr1", 248_956_422).with_md5("6aef897c3d6ff0c78aff06ac189178dd")
368 ]);
369
370 let matches = engine.find_matches(&query, 5);
371 let best = &matches[0];
372
373 assert_eq!(best.assembly_id, "grch38");
374 assert_eq!(best.assembly_name, "GRCh38");
375 assert_eq!(best.version_id, "grch38_p14");
376 assert_eq!(best.version_string, "p14");
377 }
378}