Skip to main content

ref_solver/matching/
engine.rs

1use crate::catalog::index::CandidateFinder;
2use crate::catalog::store::ReferenceCatalog;
3use crate::core::header::QueryHeader;
4use crate::core::reference::KnownReference;
5use crate::matching::diagnosis::MatchDiagnosis;
6use crate::matching::scoring::MatchScore;
7
8/// Result of matching a query against the catalog
9#[derive(Debug, Clone)]
10pub struct MatchResult {
11    /// The matched reference
12    pub reference: KnownReference,
13
14    /// Match score details
15    pub score: MatchScore,
16
17    /// Detailed diagnosis
18    pub diagnosis: MatchDiagnosis,
19}
20
21impl MatchResult {
22    #[must_use]
23    pub fn new(reference: &KnownReference, query: &QueryHeader, weights: &ScoringWeights) -> Self {
24        let score = MatchScore::calculate_with_weights(query, reference, weights);
25        let diagnosis = MatchDiagnosis::analyze(query, reference);
26
27        Self {
28            reference: reference.clone(),
29            score,
30            diagnosis,
31        }
32    }
33}
34
35/// Default minimum score threshold for matches
36pub const DEFAULT_MIN_SCORE: f64 = 0.1;
37
38/// Configuration for the matching engine
39#[derive(Debug, Clone)]
40pub struct MatchingConfig {
41    /// Minimum score threshold for including matches in results
42    pub min_score: f64,
43    /// Custom scoring weights
44    pub scoring_weights: ScoringWeights,
45}
46
47impl Default for MatchingConfig {
48    fn default() -> Self {
49        Self {
50            min_score: DEFAULT_MIN_SCORE,
51            scoring_weights: ScoringWeights::default(),
52        }
53    }
54}
55
56/// Configurable weights for the scoring algorithm
57///
58/// The scoring algorithm uses three main components:
59/// - `contig_match`: Per-contig match quality (exact, name+length, conflicts)
60/// - coverage: What fraction of the reference is covered by good matches
61/// - order: Whether contigs appear in the same order
62///
63/// Additionally, `conflict_penalty` controls how much credit MD5 conflicts receive
64/// (name+length matches but MD5 differs, indicating different sequences).
65#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
66pub struct ScoringWeights {
67    /// Weight for per-contig match score (default 0.70 = 70%)
68    #[serde(default = "default_contig_match")]
69    pub contig_match: f64,
70
71    /// Weight for reference coverage (default 0.20 = 20%)
72    #[serde(default = "default_coverage")]
73    pub coverage: f64,
74
75    /// Weight for contig ordering (default 0.10 = 10%)
76    #[serde(default = "default_order")]
77    pub order: f64,
78
79    /// Credit given to MD5 conflicts (default 0.1 = 10% credit)
80    /// Set to 0.0 for zero credit on conflicts, 1.0 to treat conflicts as matches
81    #[serde(default = "default_conflict_penalty")]
82    pub conflict_penalty: f64,
83}
84
85fn default_contig_match() -> f64 {
86    0.70
87}
88fn default_coverage() -> f64 {
89    0.20
90}
91fn default_order() -> f64 {
92    0.10
93}
94fn default_conflict_penalty() -> f64 {
95    0.1
96}
97
98impl Default for ScoringWeights {
99    fn default() -> Self {
100        Self {
101            contig_match: 0.70,    // 70%
102            coverage: 0.20,        // 20%
103            order: 0.10,           // 10%
104            conflict_penalty: 0.1, // 10% credit for conflicts
105        }
106    }
107}
108
109impl ScoringWeights {
110    /// Normalize the main weights (`contig_match`, coverage, order) to sum to 1.0
111    /// The `conflict_penalty` is kept as-is since it's a multiplier, not a weight.
112    #[must_use]
113    pub fn normalized(&self) -> Self {
114        let total = self.contig_match + self.coverage + self.order;
115
116        if total <= 0.0 {
117            return Self::default();
118        }
119
120        Self {
121            contig_match: self.contig_match / total,
122            coverage: self.coverage / total,
123            order: self.order / total,
124            conflict_penalty: self.conflict_penalty.clamp(0.0, 1.0),
125        }
126    }
127}
128
129/// The main matching engine
130pub struct MatchingEngine<'a> {
131    catalog: &'a ReferenceCatalog,
132    /// Configuration including scoring weights and thresholds
133    config: MatchingConfig,
134}
135
136impl<'a> MatchingEngine<'a> {
137    /// Create a new matching engine with custom configuration
138    #[must_use]
139    pub fn new(catalog: &'a ReferenceCatalog, config: MatchingConfig) -> Self {
140        Self { catalog, config }
141    }
142
143    /// Find the best matching references for a query
144    #[must_use]
145    pub fn find_matches(&self, query: &QueryHeader, limit: usize) -> Vec<MatchResult> {
146        // Step 1: Try exact signature match
147        if let Some(sig) = &query.signature {
148            if let Some(reference) = self.catalog.find_by_signature(sig) {
149                return vec![MatchResult::new(
150                    reference,
151                    query,
152                    &self.config.scoring_weights,
153                )];
154            }
155        }
156
157        // Step 2: Find candidates via index
158        let finder = CandidateFinder::new(self.catalog);
159        let candidate_indices = finder.find_top_candidates(query, limit * 2);
160
161        // Step 3: Score and rank candidates with custom weights
162        let mut results: Vec<MatchResult> = candidate_indices
163            .into_iter()
164            .map(|idx| {
165                let reference = &self.catalog.references[idx];
166                MatchResult::new(reference, query, &self.config.scoring_weights)
167            })
168            .collect();
169
170        // Sort by composite score descending
171        results.sort_by(|a, b| {
172            b.score
173                .composite
174                .partial_cmp(&a.score.composite)
175                .unwrap_or(std::cmp::Ordering::Equal)
176        });
177
178        // Filter to meaningful matches and limit
179        results
180            .into_iter()
181            .filter(|r| r.score.composite > self.config.min_score)
182            .take(limit)
183            .collect()
184    }
185
186    /// Find the single best match
187    #[cfg(test)]
188    #[must_use]
189    pub fn find_best_match(&self, query: &QueryHeader) -> Option<MatchResult> {
190        self.find_matches(query, 1).into_iter().next()
191    }
192}
193
194#[cfg(test)]
195mod tests {
196    use super::*;
197    use crate::core::contig::Contig;
198
199    fn make_test_catalog() -> ReferenceCatalog {
200        ReferenceCatalog::load_embedded().unwrap()
201    }
202
203    #[test]
204    fn test_find_matches_grch38() {
205        let catalog = make_test_catalog();
206        let engine = MatchingEngine::new(&catalog, MatchingConfig::default());
207
208        // Create query with GRCh38 chromosomes
209        let contigs = vec![
210            Contig::new("chr1", 248_956_422).with_md5("6aef897c3d6ff0c78aff06ac189178dd"),
211            Contig::new("chr2", 242_193_529).with_md5("f98db672eb0993dcfdabafe2a882905c"),
212            Contig::new("chr3", 198_295_559).with_md5("76635a41ea913a405ded820447d067b0"),
213        ];
214        let query = QueryHeader::new(contigs);
215
216        let matches = engine.find_matches(&query, 5);
217        assert!(!matches.is_empty());
218
219        // Best match should be a GRCh38 reference
220        let best = &matches[0];
221        assert!(
222            best.reference.display_name.contains("38")
223                || best.reference.display_name.contains("hg38"),
224            "Expected GRCh38 match, got: {}",
225            best.reference.display_name
226        );
227    }
228
229    #[test]
230    fn test_find_matches_grch37() {
231        let catalog = make_test_catalog();
232        let engine = MatchingEngine::new(&catalog, MatchingConfig::default());
233
234        // Create query with GRCh37 chromosomes
235        let contigs = vec![
236            Contig::new("chr1", 249_250_621).with_md5("1b22b98cdeb4a9304cb5d48026a85128"),
237            Contig::new("chr2", 243_199_373).with_md5("a0d9851da00400dec1098a9255ac712e"),
238        ];
239        let query = QueryHeader::new(contigs);
240
241        let matches = engine.find_matches(&query, 5);
242        assert!(!matches.is_empty());
243
244        // Best match should be a GRCh37/hg19 reference
245        let best = &matches[0];
246        assert!(
247            best.reference.display_name.contains("37")
248                || best.reference.display_name.contains("hg19")
249                || best.reference.display_name.contains("b37"),
250            "Expected GRCh37 match, got: {}",
251            best.reference.display_name
252        );
253    }
254
255    #[test]
256    fn test_find_matches_ncbi_naming() {
257        let catalog = make_test_catalog();
258        let engine = MatchingEngine::new(&catalog, MatchingConfig::default());
259
260        // Create query with NCBI naming but GRCh38 sequences
261        let contigs = vec![
262            Contig::new("1", 248_956_422).with_md5("6aef897c3d6ff0c78aff06ac189178dd"),
263            Contig::new("2", 242_193_529).with_md5("f98db672eb0993dcfdabafe2a882905c"),
264        ];
265        let query = QueryHeader::new(contigs);
266
267        let matches = engine.find_matches(&query, 5);
268        assert!(!matches.is_empty());
269
270        // Should still match GRCh38 references (score may be lower with only 2 contigs)
271        let best = &matches[0];
272        assert!(best.score.composite > 0.1); // Found a match above threshold
273        assert!(
274            best.reference.display_name.contains("38")
275                || best.reference.display_name.contains("hs38"),
276            "Expected GRCh38 match, got: {}",
277            best.reference.display_name
278        );
279    }
280
281    #[test]
282    fn test_find_matches_no_md5() {
283        let catalog = make_test_catalog();
284        let engine = MatchingEngine::new(&catalog, MatchingConfig::default());
285
286        // Create query without MD5s
287        let contigs = vec![
288            Contig::new("chr1", 248_956_422),
289            Contig::new("chr2", 242_193_529),
290            Contig::new("chr3", 198_295_559),
291        ];
292        let query = QueryHeader::new(contigs);
293
294        // Should still find matches based on name+length
295        let matches = engine.find_matches(&query, 5);
296        assert!(!matches.is_empty());
297    }
298
299    #[test]
300    fn test_find_best_match() {
301        let catalog = make_test_catalog();
302        let engine = MatchingEngine::new(&catalog, MatchingConfig::default());
303
304        let contigs =
305            vec![Contig::new("chr1", 248_956_422).with_md5("6aef897c3d6ff0c78aff06ac189178dd")];
306        let query = QueryHeader::new(contigs);
307
308        let best = engine.find_best_match(&query);
309        assert!(best.is_some());
310    }
311
312    #[test]
313    fn test_no_matches_for_invalid_contigs() {
314        let catalog = make_test_catalog();
315        let engine = MatchingEngine::new(&catalog, MatchingConfig::default());
316
317        // Create query with invalid contigs
318        let contigs = vec![
319            Contig::new("invalid_contig", 12345),
320            Contig::new("another_invalid", 67890),
321        ];
322        let query = QueryHeader::new(contigs);
323
324        let matches = engine.find_matches(&query, 5);
325        // Should return empty or low-scoring matches
326        assert!(matches.is_empty() || matches[0].score.composite < 0.2);
327    }
328
329    // ========================================================================
330    // UCSC-Style Patch Name Matching Integration Tests
331    // ========================================================================
332
333    #[test]
334    fn test_ucsc_style_fix_patch_matching() {
335        // Test that queries with UCSC-style fix-patch names match catalog references
336        // The catalog should have these names as aliases (generated from NCBI reports)
337        let catalog = make_test_catalog();
338        let engine = MatchingEngine::new(&catalog, MatchingConfig::default());
339
340        // Query with UCSC-style fix-patch name (chr1_KN196472v1_fix)
341        // This should match grch38_v38 (or similar) which has KN196472.1 as a fix-patch
342        let contigs = vec![
343            // Primary chromosomes with MD5s for strong matching
344            Contig::new("chr1", 248_956_422).with_md5("6aef897c3d6ff0c78aff06ac189178dd"),
345            Contig::new("chr2", 242_193_529).with_md5("f98db672eb0993dcfdabafe2a882905c"),
346            // UCSC-style fix-patch name (generated from KN196472.1)
347            Contig::new("chr1_KN196472v1_fix", 186_494),
348        ];
349        let query = QueryHeader::new(contigs);
350
351        let matches = engine.find_matches(&query, 5);
352        assert!(
353            !matches.is_empty(),
354            "Should find matches for UCSC-style patch names"
355        );
356
357        // Best match should be a GRCh38 reference (since fix-patches are GRCh38-specific)
358        let best = &matches[0];
359        assert!(
360            best.reference.display_name.contains("38")
361                || best.reference.display_name.contains("hg38")
362                || best.reference.display_name.contains("hs38"),
363            "Expected GRCh38 match for fix-patch query, got: {}",
364            best.reference.display_name
365        );
366    }
367
368    #[test]
369    fn test_ucsc_style_novel_patch_matching() {
370        // Test that queries with UCSC-style novel-patch (alt) names match catalog references
371        let catalog = make_test_catalog();
372        let engine = MatchingEngine::new(&catalog, MatchingConfig::default());
373
374        // Query with UCSC-style novel-patch name (chr1_KQ458382v1_alt)
375        let contigs = vec![
376            Contig::new("chr1", 248_956_422).with_md5("6aef897c3d6ff0c78aff06ac189178dd"),
377            Contig::new("chr2", 242_193_529).with_md5("f98db672eb0993dcfdabafe2a882905c"),
378            // UCSC-style novel-patch name (generated from KQ458382.1)
379            Contig::new("chr1_KQ458382v1_alt", 141_019),
380        ];
381        let query = QueryHeader::new(contigs);
382
383        let matches = engine.find_matches(&query, 5);
384        assert!(
385            !matches.is_empty(),
386            "Should find matches for UCSC-style alt patch names"
387        );
388
389        // Best match should be a GRCh38 reference
390        let best = &matches[0];
391        assert!(
392            best.reference.display_name.contains("38")
393                || best.reference.display_name.contains("hg38")
394                || best.reference.display_name.contains("hs38"),
395            "Expected GRCh38 match for novel-patch query, got: {}",
396            best.reference.display_name
397        );
398    }
399
400    #[test]
401    fn test_ucsc_style_y_chromosome_patch_matching() {
402        // Test Y chromosome fix-patch matching
403        let catalog = make_test_catalog();
404        let engine = MatchingEngine::new(&catalog, MatchingConfig::default());
405
406        let contigs = vec![
407            Contig::new("chr1", 248_956_422).with_md5("6aef897c3d6ff0c78aff06ac189178dd"),
408            Contig::new("chrY", 57_227_415).with_md5("ce3e31103314a704255f3cd90369ecce"),
409            // Y chromosome fix-patch (generated from KN196487.1)
410            Contig::new("chrY_KN196487v1_fix", 101_150),
411        ];
412        let query = QueryHeader::new(contigs);
413
414        let matches = engine.find_matches(&query, 5);
415        assert!(
416            !matches.is_empty(),
417            "Should find matches for Y chromosome fix-patch names"
418        );
419    }
420
421    #[test]
422    fn test_catalog_contains_ucsc_patch_aliases() {
423        // Verify that the embedded catalog has UCSC-style names indexed for matching
424        // Note: The matching tests above verify functional matching works. This test
425        // verifies the catalog structure supports UCSC-style names.
426        let catalog = make_test_catalog();
427
428        // Check that some GRCh38 references have UCSC-style names indexed
429        // Either in name_length_to_refs or alias_length_to_refs
430        let has_fix_patch_index = catalog
431            .name_length_to_refs
432            .keys()
433            .any(|(name, _)| name.contains("_fix") && name.starts_with("chr"))
434            || catalog
435                .alias_length_to_refs
436                .keys()
437                .any(|(name, _)| name.contains("_fix") && name.starts_with("chr"));
438
439        let has_alt_patch_index = catalog.name_length_to_refs.keys().any(|(name, _)| {
440            name.ends_with("_alt") && name.starts_with("chr") && name.contains('v')
441        }) || catalog.alias_length_to_refs.keys().any(|(name, _)| {
442            name.ends_with("_alt") && name.starts_with("chr") && name.contains('v')
443        });
444
445        // At minimum, the matching tests above prove UCSC patch names work.
446        // This test is informational - if the catalog structure changes,
447        // we want to ensure UCSC names are still indexed somewhere.
448        if !has_fix_patch_index && !has_alt_patch_index {
449            // Check if any reference has UCSC-style names in its name_length_set
450            let any_ref_has_patches = catalog.references.iter().any(|r| {
451                r.name_length_set.iter().any(|(name, _)| {
452                    (name.contains("_fix") || name.contains("_alt"))
453                        && name.starts_with("chr")
454                        && name.contains('v')
455                })
456            });
457
458            assert!(
459                any_ref_has_patches,
460                "Catalog should have UCSC-style patch names indexed for matching. \
461                 The matching tests pass, so this may indicate a catalog structure change."
462            );
463        }
464    }
465}