1use crate::catalog::index::CandidateFinder;
2use crate::catalog::store::ReferenceCatalog;
3use crate::core::header::QueryHeader;
4use crate::core::reference::KnownReference;
5use crate::matching::diagnosis::MatchDiagnosis;
6use crate::matching::scoring::MatchScore;
7
8#[derive(Debug, Clone)]
10pub struct MatchResult {
11 pub reference: KnownReference,
13
14 pub score: MatchScore,
16
17 pub diagnosis: MatchDiagnosis,
19}
20
21impl MatchResult {
22 #[must_use]
23 pub fn new(reference: &KnownReference, query: &QueryHeader, weights: &ScoringWeights) -> Self {
24 let score = MatchScore::calculate_with_weights(query, reference, weights);
25 let diagnosis = MatchDiagnosis::analyze(query, reference);
26
27 Self {
28 reference: reference.clone(),
29 score,
30 diagnosis,
31 }
32 }
33}
34
35pub const DEFAULT_MIN_SCORE: f64 = 0.1;
37
38#[derive(Debug, Clone)]
40pub struct MatchingConfig {
41 pub min_score: f64,
43 pub scoring_weights: ScoringWeights,
45}
46
47impl Default for MatchingConfig {
48 fn default() -> Self {
49 Self {
50 min_score: DEFAULT_MIN_SCORE,
51 scoring_weights: ScoringWeights::default(),
52 }
53 }
54}
55
56#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
66pub struct ScoringWeights {
67 #[serde(default = "default_contig_match")]
69 pub contig_match: f64,
70
71 #[serde(default = "default_coverage")]
73 pub coverage: f64,
74
75 #[serde(default = "default_order")]
77 pub order: f64,
78
79 #[serde(default = "default_conflict_penalty")]
82 pub conflict_penalty: f64,
83}
84
85fn default_contig_match() -> f64 {
86 0.70
87}
88fn default_coverage() -> f64 {
89 0.20
90}
91fn default_order() -> f64 {
92 0.10
93}
94fn default_conflict_penalty() -> f64 {
95 0.1
96}
97
98impl Default for ScoringWeights {
99 fn default() -> Self {
100 Self {
101 contig_match: 0.70, coverage: 0.20, order: 0.10, conflict_penalty: 0.1, }
106 }
107}
108
109impl ScoringWeights {
110 #[must_use]
113 pub fn normalized(&self) -> Self {
114 let total = self.contig_match + self.coverage + self.order;
115
116 if total <= 0.0 {
117 return Self::default();
118 }
119
120 Self {
121 contig_match: self.contig_match / total,
122 coverage: self.coverage / total,
123 order: self.order / total,
124 conflict_penalty: self.conflict_penalty.clamp(0.0, 1.0),
125 }
126 }
127}
128
129pub struct MatchingEngine<'a> {
131 catalog: &'a ReferenceCatalog,
132 config: MatchingConfig,
134}
135
136impl<'a> MatchingEngine<'a> {
137 #[must_use]
139 pub fn new(catalog: &'a ReferenceCatalog, config: MatchingConfig) -> Self {
140 Self { catalog, config }
141 }
142
143 #[must_use]
145 pub fn find_matches(&self, query: &QueryHeader, limit: usize) -> Vec<MatchResult> {
146 if let Some(sig) = &query.signature {
148 if let Some(reference) = self.catalog.find_by_signature(sig) {
149 return vec![MatchResult::new(
150 reference,
151 query,
152 &self.config.scoring_weights,
153 )];
154 }
155 }
156
157 let finder = CandidateFinder::new(self.catalog);
159 let candidate_indices = finder.find_top_candidates(query, limit * 2);
160
161 let mut results: Vec<MatchResult> = candidate_indices
163 .into_iter()
164 .map(|idx| {
165 let reference = &self.catalog.references[idx];
166 MatchResult::new(reference, query, &self.config.scoring_weights)
167 })
168 .collect();
169
170 results.sort_by(|a, b| {
172 b.score
173 .composite
174 .partial_cmp(&a.score.composite)
175 .unwrap_or(std::cmp::Ordering::Equal)
176 });
177
178 results
180 .into_iter()
181 .filter(|r| r.score.composite > self.config.min_score)
182 .take(limit)
183 .collect()
184 }
185
186 #[cfg(test)]
188 #[must_use]
189 pub fn find_best_match(&self, query: &QueryHeader) -> Option<MatchResult> {
190 self.find_matches(query, 1).into_iter().next()
191 }
192}
193
194#[cfg(test)]
195mod tests {
196 use super::*;
197 use crate::core::contig::Contig;
198
199 fn make_test_catalog() -> ReferenceCatalog {
200 ReferenceCatalog::load_embedded().unwrap()
201 }
202
203 #[test]
204 fn test_find_matches_grch38() {
205 let catalog = make_test_catalog();
206 let engine = MatchingEngine::new(&catalog, MatchingConfig::default());
207
208 let contigs = vec![
210 Contig::new("chr1", 248_956_422).with_md5("6aef897c3d6ff0c78aff06ac189178dd"),
211 Contig::new("chr2", 242_193_529).with_md5("f98db672eb0993dcfdabafe2a882905c"),
212 Contig::new("chr3", 198_295_559).with_md5("76635a41ea913a405ded820447d067b0"),
213 ];
214 let query = QueryHeader::new(contigs);
215
216 let matches = engine.find_matches(&query, 5);
217 assert!(!matches.is_empty());
218
219 let best = &matches[0];
221 assert!(
222 best.reference.display_name.contains("38")
223 || best.reference.display_name.contains("hg38"),
224 "Expected GRCh38 match, got: {}",
225 best.reference.display_name
226 );
227 }
228
229 #[test]
230 fn test_find_matches_grch37() {
231 let catalog = make_test_catalog();
232 let engine = MatchingEngine::new(&catalog, MatchingConfig::default());
233
234 let contigs = vec![
236 Contig::new("chr1", 249_250_621).with_md5("1b22b98cdeb4a9304cb5d48026a85128"),
237 Contig::new("chr2", 243_199_373).with_md5("a0d9851da00400dec1098a9255ac712e"),
238 ];
239 let query = QueryHeader::new(contigs);
240
241 let matches = engine.find_matches(&query, 5);
242 assert!(!matches.is_empty());
243
244 let best = &matches[0];
246 assert!(
247 best.reference.display_name.contains("37")
248 || best.reference.display_name.contains("hg19")
249 || best.reference.display_name.contains("b37"),
250 "Expected GRCh37 match, got: {}",
251 best.reference.display_name
252 );
253 }
254
255 #[test]
256 fn test_find_matches_ncbi_naming() {
257 let catalog = make_test_catalog();
258 let engine = MatchingEngine::new(&catalog, MatchingConfig::default());
259
260 let contigs = vec![
262 Contig::new("1", 248_956_422).with_md5("6aef897c3d6ff0c78aff06ac189178dd"),
263 Contig::new("2", 242_193_529).with_md5("f98db672eb0993dcfdabafe2a882905c"),
264 ];
265 let query = QueryHeader::new(contigs);
266
267 let matches = engine.find_matches(&query, 5);
268 assert!(!matches.is_empty());
269
270 let best = &matches[0];
272 assert!(best.score.composite > 0.1); assert!(
274 best.reference.display_name.contains("38")
275 || best.reference.display_name.contains("hs38"),
276 "Expected GRCh38 match, got: {}",
277 best.reference.display_name
278 );
279 }
280
281 #[test]
282 fn test_find_matches_no_md5() {
283 let catalog = make_test_catalog();
284 let engine = MatchingEngine::new(&catalog, MatchingConfig::default());
285
286 let contigs = vec![
288 Contig::new("chr1", 248_956_422),
289 Contig::new("chr2", 242_193_529),
290 Contig::new("chr3", 198_295_559),
291 ];
292 let query = QueryHeader::new(contigs);
293
294 let matches = engine.find_matches(&query, 5);
296 assert!(!matches.is_empty());
297 }
298
299 #[test]
300 fn test_find_best_match() {
301 let catalog = make_test_catalog();
302 let engine = MatchingEngine::new(&catalog, MatchingConfig::default());
303
304 let contigs =
305 vec![Contig::new("chr1", 248_956_422).with_md5("6aef897c3d6ff0c78aff06ac189178dd")];
306 let query = QueryHeader::new(contigs);
307
308 let best = engine.find_best_match(&query);
309 assert!(best.is_some());
310 }
311
312 #[test]
313 fn test_no_matches_for_invalid_contigs() {
314 let catalog = make_test_catalog();
315 let engine = MatchingEngine::new(&catalog, MatchingConfig::default());
316
317 let contigs = vec![
319 Contig::new("invalid_contig", 12345),
320 Contig::new("another_invalid", 67890),
321 ];
322 let query = QueryHeader::new(contigs);
323
324 let matches = engine.find_matches(&query, 5);
325 assert!(matches.is_empty() || matches[0].score.composite < 0.2);
327 }
328
329 #[test]
334 fn test_ucsc_style_fix_patch_matching() {
335 let catalog = make_test_catalog();
338 let engine = MatchingEngine::new(&catalog, MatchingConfig::default());
339
340 let contigs = vec![
343 Contig::new("chr1", 248_956_422).with_md5("6aef897c3d6ff0c78aff06ac189178dd"),
345 Contig::new("chr2", 242_193_529).with_md5("f98db672eb0993dcfdabafe2a882905c"),
346 Contig::new("chr1_KN196472v1_fix", 186_494),
348 ];
349 let query = QueryHeader::new(contigs);
350
351 let matches = engine.find_matches(&query, 5);
352 assert!(
353 !matches.is_empty(),
354 "Should find matches for UCSC-style patch names"
355 );
356
357 let best = &matches[0];
359 assert!(
360 best.reference.display_name.contains("38")
361 || best.reference.display_name.contains("hg38")
362 || best.reference.display_name.contains("hs38"),
363 "Expected GRCh38 match for fix-patch query, got: {}",
364 best.reference.display_name
365 );
366 }
367
368 #[test]
369 fn test_ucsc_style_novel_patch_matching() {
370 let catalog = make_test_catalog();
372 let engine = MatchingEngine::new(&catalog, MatchingConfig::default());
373
374 let contigs = vec![
376 Contig::new("chr1", 248_956_422).with_md5("6aef897c3d6ff0c78aff06ac189178dd"),
377 Contig::new("chr2", 242_193_529).with_md5("f98db672eb0993dcfdabafe2a882905c"),
378 Contig::new("chr1_KQ458382v1_alt", 141_019),
380 ];
381 let query = QueryHeader::new(contigs);
382
383 let matches = engine.find_matches(&query, 5);
384 assert!(
385 !matches.is_empty(),
386 "Should find matches for UCSC-style alt patch names"
387 );
388
389 let best = &matches[0];
391 assert!(
392 best.reference.display_name.contains("38")
393 || best.reference.display_name.contains("hg38")
394 || best.reference.display_name.contains("hs38"),
395 "Expected GRCh38 match for novel-patch query, got: {}",
396 best.reference.display_name
397 );
398 }
399
400 #[test]
401 fn test_ucsc_style_y_chromosome_patch_matching() {
402 let catalog = make_test_catalog();
404 let engine = MatchingEngine::new(&catalog, MatchingConfig::default());
405
406 let contigs = vec![
407 Contig::new("chr1", 248_956_422).with_md5("6aef897c3d6ff0c78aff06ac189178dd"),
408 Contig::new("chrY", 57_227_415).with_md5("ce3e31103314a704255f3cd90369ecce"),
409 Contig::new("chrY_KN196487v1_fix", 101_150),
411 ];
412 let query = QueryHeader::new(contigs);
413
414 let matches = engine.find_matches(&query, 5);
415 assert!(
416 !matches.is_empty(),
417 "Should find matches for Y chromosome fix-patch names"
418 );
419 }
420
421 #[test]
422 fn test_catalog_contains_ucsc_patch_aliases() {
423 let catalog = make_test_catalog();
427
428 let has_fix_patch_index = catalog
431 .name_length_to_refs
432 .keys()
433 .any(|(name, _)| name.contains("_fix") && name.starts_with("chr"))
434 || catalog
435 .alias_length_to_refs
436 .keys()
437 .any(|(name, _)| name.contains("_fix") && name.starts_with("chr"));
438
439 let has_alt_patch_index = catalog.name_length_to_refs.keys().any(|(name, _)| {
440 name.ends_with("_alt") && name.starts_with("chr") && name.contains('v')
441 }) || catalog.alias_length_to_refs.keys().any(|(name, _)| {
442 name.ends_with("_alt") && name.starts_with("chr") && name.contains('v')
443 });
444
445 if !has_fix_patch_index && !has_alt_patch_index {
449 let any_ref_has_patches = catalog.references.iter().any(|r| {
451 r.name_length_set.iter().any(|(name, _)| {
452 (name.contains("_fix") || name.contains("_alt"))
453 && name.starts_with("chr")
454 && name.contains('v')
455 })
456 });
457
458 assert!(
459 any_ref_has_patches,
460 "Catalog should have UCSC-style patch names indexed for matching. \
461 The matching tests pass, so this may indicate a catalog structure change."
462 );
463 }
464 }
465}