Skip to main content

ref_solver/matching/
scoring.rs

1use std::collections::HashSet;
2
3use crate::core::contig::Contig;
4use crate::core::header::QueryHeader;
5use crate::core::reference::KnownReference;
6use crate::core::types::Confidence;
7
8/// Classification of how a query contig matches a reference
9#[derive(Debug, Clone, Copy, PartialEq, Eq)]
10pub enum ContigMatchType {
11    /// Name+length match AND MD5 matches (or both are primary chromosomes with same length)
12    Exact,
13    /// Name+length match, but MD5 not available on one or both sides (neutral)
14    NameLengthNoMd5,
15    /// Name+length match, but MD5 differs (different sequence - heavy penalty!)
16    Md5Conflict,
17    /// No name+length match found
18    Unmatched,
19}
20
21/// Safely convert usize to f64 for percentage calculations
22///
23/// This function explicitly handles the precision loss that occurs when converting
24/// usize to f64 on 64-bit platforms. For typical bioinformatics use cases, the
25/// numbers are well within the safe range of f64 mantissa precision.
26#[inline]
27fn count_to_f64(count: usize) -> f64 {
28    #[allow(clippy::cast_precision_loss)]
29    {
30        count as f64
31    }
32}
33
34/// Detailed similarity scores between a query and a reference
35#[derive(Debug, Clone)]
36pub struct MatchScore {
37    /// Weighted composite score (the primary ranking metric)
38    pub composite: f64,
39
40    /// Confidence level derived from score
41    pub confidence: Confidence,
42
43    // === New per-contig match classification metrics ===
44    /// Number of contigs with exact match (name+length+MD5 all match)
45    pub exact_matches: usize,
46
47    /// Number of contigs with name+length match but no MD5 available (neutral)
48    pub name_length_matches: usize,
49
50    /// Number of contigs with name+length match but MD5 conflicts (different sequence!)
51    pub md5_conflicts: usize,
52
53    /// Number of contigs with no match found
54    pub unmatched: usize,
55
56    // === Derived scores ===
57    /// Per-contig match score: (exact + neutral + 0.1*conflicts) / total
58    pub match_quality: f64,
59
60    /// Coverage score: `good_matches` / `reference_contigs`
61    pub coverage_score: f64,
62
63    /// Fraction of contigs in correct relative order
64    pub order_score: f64,
65
66    /// Are matched contigs in the same order?
67    pub order_preserved: bool,
68
69    // === Legacy metrics (for backward compatibility) ===
70    /// Jaccard similarity of MD5 sets: |intersection| / |union|
71    pub md5_jaccard: f64,
72
73    /// Jaccard similarity of (`normalized_name`, length) pairs
74    pub name_length_jaccard: f64,
75
76    /// Fraction of query contigs matched by MD5
77    pub md5_query_coverage: f64,
78
79    /// Fraction of query contigs matched by name+length
80    pub name_length_query_coverage: f64,
81}
82
83impl MatchScore {
84    /// Calculate match score between query and reference
85    ///
86    /// Uses the new contig-based scoring algorithm:
87    /// - Classifies each query contig as Exact, `NameLengthNoMd5`, `Md5Conflict`, or Unmatched
88    /// - Exact and `NameLengthNoMd5` get full credit (1.0)
89    /// - `Md5Conflict` gets heavy penalty (0.1 credit - name is right but sequence is wrong)
90    /// - Unmatched gets no credit
91    ///
92    /// Composite = 70% `match_quality` + 20% `coverage_score` + 10% `order_score`
93    #[must_use]
94    pub fn calculate(query: &QueryHeader, reference: &KnownReference) -> Self {
95        // Classify each query contig
96        let mut exact_matches = 0usize;
97        let mut name_length_matches = 0usize; // No MD5 available
98        let mut md5_conflicts = 0usize;
99        let mut unmatched = 0usize;
100
101        for contig in &query.contigs {
102            match classify_contig_match(contig, reference) {
103                ContigMatchType::Exact => exact_matches += 1,
104                ContigMatchType::NameLengthNoMd5 => name_length_matches += 1,
105                ContigMatchType::Md5Conflict => md5_conflicts += 1,
106                ContigMatchType::Unmatched => unmatched += 1,
107            }
108        }
109
110        let total_query = query.contigs.len();
111
112        // Contig match score: full credit for exact/neutral, 10% for conflicts, 0 for unmatched
113        // Key principle: MD5 absence is neutral (full credit), MD5 conflict is penalized
114        let weighted_matches = count_to_f64(exact_matches)
115            + count_to_f64(name_length_matches)
116            + (count_to_f64(md5_conflicts) * 0.1);
117
118        let match_quality = if total_query > 0 {
119            weighted_matches / count_to_f64(total_query)
120        } else {
121            0.0
122        };
123
124        // Coverage: what fraction of reference is covered by good matches (exact + neutral)
125        let good_matches = exact_matches + name_length_matches;
126        let coverage_score = if reference.contigs.is_empty() {
127            0.0
128        } else {
129            (count_to_f64(good_matches) / count_to_f64(reference.contigs.len())).min(1.0)
130        };
131
132        // Order analysis (existing function)
133        let (order_preserved, order_score) = analyze_order(query, reference);
134
135        // New composite formula: 70% match quality, 20% coverage, 10% order
136        // Clamp to [0.0, 1.0] to handle floating point precision issues
137        let composite = ((0.70 * match_quality) + (0.20 * coverage_score) + (0.10 * order_score))
138            .clamp(0.0, 1.0);
139
140        let confidence = Confidence::from_score(composite);
141
142        // Compute legacy metrics for backward compatibility
143        let md5_jaccard = jaccard_similarity(&query.md5_set, &reference.md5_set);
144        let (name_length_jaccard, name_length_query_coverage) =
145            calculate_name_length_similarity_with_aliases(query, reference);
146        let md5_query_coverage = if query.md5_set.is_empty() {
147            0.0
148        } else {
149            count_to_f64(query.md5_set.intersection(&reference.md5_set).count())
150                / count_to_f64(query.md5_set.len())
151        };
152
153        Self {
154            composite,
155            confidence,
156            exact_matches,
157            name_length_matches,
158            md5_conflicts,
159            unmatched,
160            match_quality,
161            coverage_score,
162            order_score,
163            order_preserved,
164            md5_jaccard,
165            name_length_jaccard,
166            md5_query_coverage,
167            name_length_query_coverage,
168        }
169    }
170
171    /// Calculate match score with custom scoring weights
172    ///
173    /// Uses the new contig-based scoring algorithm with configurable weights:
174    /// - `contig_match_weight`: Weight for per-contig match score (default 70%)
175    /// - `coverage_weight`: Weight for reference coverage (default 20%)
176    /// - `order_weight`: Weight for contig ordering (default 10%)
177    /// - `conflict_penalty`: Credit given to MD5 conflicts (default 0.1 = 10%)
178    #[must_use]
179    pub fn calculate_with_weights(
180        query: &QueryHeader,
181        reference: &KnownReference,
182        weights: &crate::matching::engine::ScoringWeights,
183    ) -> Self {
184        // Classify each query contig
185        let mut exact_matches = 0usize;
186        let mut name_length_matches = 0usize;
187        let mut md5_conflicts = 0usize;
188        let mut unmatched = 0usize;
189
190        for contig in &query.contigs {
191            match classify_contig_match(contig, reference) {
192                ContigMatchType::Exact => exact_matches += 1,
193                ContigMatchType::NameLengthNoMd5 => name_length_matches += 1,
194                ContigMatchType::Md5Conflict => md5_conflicts += 1,
195                ContigMatchType::Unmatched => unmatched += 1,
196            }
197        }
198
199        let total_query = query.contigs.len();
200
201        // Normalize weights
202        let normalized_weights = weights.normalized();
203
204        // Contig match score using configurable conflict penalty
205        let weighted_matches = count_to_f64(exact_matches)
206            + count_to_f64(name_length_matches)
207            + (count_to_f64(md5_conflicts) * normalized_weights.conflict_penalty);
208
209        let match_quality = if total_query > 0 {
210            weighted_matches / count_to_f64(total_query)
211        } else {
212            0.0
213        };
214
215        // Coverage: what fraction of reference is covered by good matches
216        let good_matches = exact_matches + name_length_matches;
217        let coverage_score = if reference.contigs.is_empty() {
218            0.0
219        } else {
220            (count_to_f64(good_matches) / count_to_f64(reference.contigs.len())).min(1.0)
221        };
222
223        // Order analysis
224        let (order_preserved, order_score) = analyze_order(query, reference);
225
226        // Composite with custom weights
227        // Clamp to [0.0, 1.0] to handle floating point precision issues
228        let composite = ((normalized_weights.contig_match * match_quality)
229            + (normalized_weights.coverage * coverage_score)
230            + (normalized_weights.order * order_score))
231            .clamp(0.0, 1.0);
232
233        let confidence = Confidence::from_score(composite);
234
235        // Compute legacy metrics for backward compatibility
236        let md5_jaccard = jaccard_similarity(&query.md5_set, &reference.md5_set);
237        let (name_length_jaccard, name_length_query_coverage) =
238            calculate_name_length_similarity_with_aliases(query, reference);
239        let md5_query_coverage = if query.md5_set.is_empty() {
240            0.0
241        } else {
242            count_to_f64(query.md5_set.intersection(&reference.md5_set).count())
243                / count_to_f64(query.md5_set.len())
244        };
245
246        Self {
247            composite,
248            confidence,
249            exact_matches,
250            name_length_matches,
251            md5_conflicts,
252            unmatched,
253            match_quality,
254            coverage_score,
255            order_score,
256            order_preserved,
257            md5_jaccard,
258            name_length_jaccard,
259            md5_query_coverage,
260            name_length_query_coverage,
261        }
262    }
263}
264
265/// Jaccard similarity: |A ∩ B| / |A ∪ B|
266///
267/// Returns 0.0 when both sets are empty (undefined mathematically, but 0.0 is safer
268/// for matching to avoid false positives from two references with no MD5s).
269fn jaccard_similarity<T: Eq + std::hash::Hash>(a: &HashSet<T>, b: &HashSet<T>) -> f64 {
270    let intersection = a.intersection(b).count();
271    let union = a.union(b).count();
272    if union == 0 {
273        // Both sets empty - return 0.0 to avoid false positive matches
274        0.0
275    } else {
276        count_to_f64(intersection) / count_to_f64(union)
277    }
278}
279
280/// Calculate name+length similarity accounting for alias-based matching.
281///
282/// A query contig matches a reference contig if:
283/// - Query name+length matches reference name+length directly, OR
284/// - Query alias+length matches reference name+length (reverse alias matching)
285///
286/// Returns (`jaccard_similarity`, `query_coverage`)
287fn calculate_name_length_similarity_with_aliases(
288    query: &QueryHeader,
289    reference: &KnownReference,
290) -> (f64, f64) {
291    // Count query contigs that match (by name or alias)
292    let mut matched_query_contigs = 0usize;
293    let mut matched_ref_keys: HashSet<(String, u64)> = HashSet::new();
294
295    for contig in &query.contigs {
296        let name_key = (contig.name.clone(), contig.length);
297
298        // Check direct name match
299        if reference.name_length_set.contains(&name_key) {
300            matched_query_contigs += 1;
301            matched_ref_keys.insert(name_key);
302            continue;
303        }
304
305        // Check alias matches (query alias -> ref name)
306        for alias in &contig.aliases {
307            let alias_key = (alias.clone(), contig.length);
308            if reference.name_length_set.contains(&alias_key) {
309                matched_query_contigs += 1;
310                matched_ref_keys.insert(alias_key);
311                break;
312            }
313        }
314    }
315
316    // Calculate Jaccard: |matched| / |union|
317    // Union = query contigs + ref contigs - matched (to avoid double counting)
318    let query_count = query.contigs.len();
319    let ref_count = reference.contigs.len();
320    let union_size = query_count + ref_count - matched_ref_keys.len();
321
322    let jaccard = if union_size == 0 {
323        0.0
324    } else {
325        count_to_f64(matched_ref_keys.len()) / count_to_f64(union_size)
326    };
327
328    let coverage = if query_count == 0 {
329        0.0
330    } else {
331        count_to_f64(matched_query_contigs) / count_to_f64(query_count)
332    };
333
334    (jaccard, coverage)
335}
336
337/// Analyze if contigs are in the same order.
338/// Considers both direct name matches and alias-based matches.
339///
340/// Returns (`order_preserved`, `order_score`) where:
341/// - `order_preserved`: true if all matched contigs appear in the same order
342/// - `order_score`: fraction of contigs in the longest increasing subsequence
343///
344/// When there are < 2 matching contigs, returns (true, 0.0) since order
345/// cannot be meaningfully assessed with so few matches.
346fn analyze_order(query: &QueryHeader, reference: &KnownReference) -> (bool, f64) {
347    use std::collections::HashMap;
348
349    // Build position map for reference using exact names AND aliases
350    // This allows order matching when query uses different naming convention
351    let mut ref_positions: HashMap<(&str, u64), usize> = HashMap::new();
352    for (i, contig) in reference.contigs.iter().enumerate() {
353        // Add primary name
354        ref_positions.insert((contig.name.as_str(), contig.length), i);
355        // Add all aliases (so query "chr1" can find reference "1" with alias "chr1")
356        for alias in &contig.aliases {
357            ref_positions.insert((alias.as_str(), contig.length), i);
358        }
359    }
360
361    // Get positions of query contigs in reference ordering
362    // Check both direct name match and alias matches
363    let mut positions: Vec<usize> = Vec::new();
364    for contig in &query.contigs {
365        // Try direct name match first
366        let key = (contig.name.as_str(), contig.length);
367        if let Some(&pos) = ref_positions.get(&key) {
368            positions.push(pos);
369            continue;
370        }
371
372        // Try alias matches
373        for alias in &contig.aliases {
374            let alias_key = (alias.as_str(), contig.length);
375            if let Some(&pos) = ref_positions.get(&alias_key) {
376                positions.push(pos);
377                break;
378            }
379        }
380    }
381
382    // Need at least 2 matching contigs to assess ordering
383    if positions.len() < 2 {
384        // Return neutral values: order is trivially preserved, but score is 0
385        // to avoid inflating composite scores with insufficient data
386        return (true, 0.0);
387    }
388
389    // Check if sorted
390    let is_sorted = positions.windows(2).all(|w| w[0] < w[1]);
391
392    // Calculate longest increasing subsequence for order score
393    let lis_len = longest_increasing_subsequence(&positions);
394    let order_score = count_to_f64(lis_len) / count_to_f64(positions.len());
395
396    (is_sorted, order_score)
397}
398
399/// Length of longest increasing subsequence (LIS).
400///
401/// Returns 0 for empty arrays, otherwise the length of the LIS (minimum 1).
402fn longest_increasing_subsequence(arr: &[usize]) -> usize {
403    if arr.is_empty() {
404        return 0;
405    }
406
407    // dp[i] = length of LIS ending at index i (minimum 1 for any single element)
408    let mut dp = vec![1usize; arr.len()];
409
410    for i in 1..arr.len() {
411        for j in 0..i {
412            if arr[j] < arr[i] {
413                dp[i] = dp[i].max(dp[j] + 1);
414            }
415        }
416    }
417
418    // Safety: dp is non-empty (arr.len() >= 1 after early return check)
419    dp.into_iter().max().expect("dp is non-empty")
420}
421
422/// Classify how a query contig matches against a reference.
423///
424/// Returns the match type indicating whether the contig:
425/// - Has an exact match (name+length+MD5 all match)
426/// - Has a name+length match but no MD5 to compare (neutral)
427/// - Has a name+length match but MD5 differs (conflict - different sequence!)
428/// - Has no match
429fn classify_contig_match(query_contig: &Contig, reference: &KnownReference) -> ContigMatchType {
430    // First, find if there's a name+length match (including aliases)
431    let matched_ref_contig = find_matching_reference_contig(query_contig, reference);
432
433    match matched_ref_contig {
434        None => ContigMatchType::Unmatched,
435        Some(ref_contig) => {
436            // Name+length matched, now check MD5
437            match (&query_contig.md5, &ref_contig.md5) {
438                // Both have MD5 - compare them
439                (Some(q_md5), Some(r_md5)) => {
440                    if q_md5.eq_ignore_ascii_case(r_md5) {
441                        ContigMatchType::Exact
442                    } else {
443                        // MD5 mismatch - same name/length but different sequence!
444                        ContigMatchType::Md5Conflict
445                    }
446                }
447                // Either or both missing MD5 - neutral (can't verify but not a conflict)
448                _ => ContigMatchType::NameLengthNoMd5,
449            }
450        }
451    }
452}
453
454/// Find the reference contig that matches the query contig by name+length.
455/// Checks both direct name match and alias matches.
456fn find_matching_reference_contig<'a>(
457    query_contig: &Contig,
458    reference: &'a KnownReference,
459) -> Option<&'a Contig> {
460    // Check if query name+length matches any reference contig
461    let query_key = (query_contig.name.clone(), query_contig.length);
462    if reference.name_length_set.contains(&query_key) {
463        // Find the actual contig (could be by name or alias)
464        for ref_contig in &reference.contigs {
465            if ref_contig.name == query_contig.name && ref_contig.length == query_contig.length {
466                return Some(ref_contig);
467            }
468            // Check if query name matches a reference alias
469            for alias in &ref_contig.aliases {
470                if alias == &query_contig.name && ref_contig.length == query_contig.length {
471                    return Some(ref_contig);
472                }
473            }
474        }
475    }
476
477    // Check if any query alias matches a reference name or alias
478    for query_alias in &query_contig.aliases {
479        let alias_key = (query_alias.clone(), query_contig.length);
480        if reference.name_length_set.contains(&alias_key) {
481            // Find the actual contig
482            for ref_contig in &reference.contigs {
483                if ref_contig.name == *query_alias && ref_contig.length == query_contig.length {
484                    return Some(ref_contig);
485                }
486                for ref_alias in &ref_contig.aliases {
487                    if ref_alias == query_alias && ref_contig.length == query_contig.length {
488                        return Some(ref_contig);
489                    }
490                }
491            }
492        }
493    }
494
495    None
496}
497
498#[cfg(test)]
499mod tests {
500    use super::*;
501
502    #[test]
503    fn test_jaccard_similarity() {
504        let a: HashSet<i32> = [1, 2, 3].into_iter().collect();
505        let b: HashSet<i32> = [2, 3, 4].into_iter().collect();
506
507        let similarity = jaccard_similarity(&a, &b);
508        // intersection = {2, 3} = 2, union = {1, 2, 3, 4} = 4
509        assert!((similarity - 0.5).abs() < 0.001);
510
511        // Empty sets should return 0.0 to avoid false positives
512        let empty1: HashSet<i32> = HashSet::new();
513        let empty2: HashSet<i32> = HashSet::new();
514        assert!((jaccard_similarity(&empty1, &empty2) - 0.0).abs() < 0.001);
515
516        // One empty, one non-empty should return 0.0
517        assert!((jaccard_similarity(&a, &empty1) - 0.0).abs() < 0.001);
518
519        // Identical sets should return 1.0
520        let c: HashSet<i32> = [1, 2, 3].into_iter().collect();
521        assert!((jaccard_similarity(&a, &c) - 1.0).abs() < 0.001);
522    }
523
524    #[test]
525    fn test_longest_increasing_subsequence() {
526        assert_eq!(longest_increasing_subsequence(&[1, 2, 3, 4, 5]), 5);
527        assert_eq!(longest_increasing_subsequence(&[5, 4, 3, 2, 1]), 1);
528        assert_eq!(longest_increasing_subsequence(&[1, 3, 2, 4, 5]), 4);
529        assert_eq!(longest_increasing_subsequence(&[]), 0);
530    }
531
532    #[test]
533    fn test_composite_score_never_exceeds_one() {
534        use crate::core::contig::Contig;
535        use crate::core::reference::KnownReference;
536        use crate::core::types::{Assembly, ReferenceSource};
537
538        // Create a reference with some contigs
539        let ref_contigs = vec![
540            Contig::new("chr1", 1000),
541            Contig::new("chr2", 2000),
542            Contig::new("chr3", 3000),
543        ];
544        let reference = KnownReference::new(
545            "test_ref",
546            "Test Reference",
547            Assembly::Grch38,
548            ReferenceSource::Custom("test".to_string()),
549        )
550        .with_contigs(ref_contigs);
551
552        // Test with query that has NO MD5s (triggers the redistribution code path)
553        let query_no_md5 = QueryHeader::new(vec![
554            Contig::new("chr1", 1000),
555            Contig::new("chr2", 2000),
556            Contig::new("chr3", 3000),
557        ]);
558
559        let score_no_md5 = MatchScore::calculate(&query_no_md5, &reference);
560        assert!(
561            score_no_md5.composite <= 1.0,
562            "Composite score without MD5 should not exceed 1.0, got {}",
563            score_no_md5.composite
564        );
565        assert!(
566            score_no_md5.composite >= 0.0,
567            "Composite score should not be negative, got {}",
568            score_no_md5.composite
569        );
570
571        // Test with query that HAS MD5s
572        let query_with_md5 = QueryHeader::new(vec![
573            Contig::new("chr1", 1000).with_md5("abc123"),
574            Contig::new("chr2", 2000).with_md5("def456"),
575            Contig::new("chr3", 3000).with_md5("ghi789"),
576        ]);
577
578        let score_with_md5 = MatchScore::calculate(&query_with_md5, &reference);
579        assert!(
580            score_with_md5.composite <= 1.0,
581            "Composite score with MD5 should not exceed 1.0, got {}",
582            score_with_md5.composite
583        );
584        assert!(
585            score_with_md5.composite >= 0.0,
586            "Composite score should not be negative, got {}",
587            score_with_md5.composite
588        );
589    }
590
591    #[test]
592    fn test_composite_score_with_custom_weights_never_exceeds_one() {
593        use crate::core::contig::Contig;
594        use crate::core::reference::KnownReference;
595        use crate::core::types::{Assembly, ReferenceSource};
596        use crate::matching::engine::ScoringWeights;
597
598        let ref_contigs = vec![Contig::new("chr1", 1000), Contig::new("chr2", 2000)];
599        let reference = KnownReference::new(
600            "test_ref",
601            "Test Reference",
602            Assembly::Grch38,
603            ReferenceSource::Custom("test".to_string()),
604        )
605        .with_contigs(ref_contigs);
606
607        let query = QueryHeader::new(vec![Contig::new("chr1", 1000), Contig::new("chr2", 2000)]);
608
609        // Test with various weight configurations
610        let weight_configs = vec![
611            ScoringWeights {
612                contig_match: 0.7,
613                coverage: 0.2,
614                order: 0.1,
615                conflict_penalty: 0.1,
616            },
617            ScoringWeights {
618                contig_match: 0.5,
619                coverage: 0.3,
620                order: 0.2,
621                conflict_penalty: 0.0,
622            },
623            ScoringWeights {
624                contig_match: 1.0,
625                coverage: 1.0,
626                order: 1.0,
627                conflict_penalty: 0.5,
628            },
629        ];
630
631        for weights in weight_configs {
632            let score = MatchScore::calculate_with_weights(&query, &reference, &weights);
633            assert!(
634                score.composite <= 1.0,
635                "Composite score should not exceed 1.0 with weights {:?}, got {}",
636                weights,
637                score.composite
638            );
639            assert!(
640                score.composite >= 0.0,
641                "Composite score should not be negative with weights {:?}, got {}",
642                weights,
643                score.composite
644            );
645        }
646    }
647
648    // Tests for all 4 alias matching combinations:
649    // 1. Query name → Ref name (direct match)
650    // 2. Query name → Ref alias (UCSC query chr1 matches NCBI ref with chr1 alias)
651    // 3. Query alias → Ref name (NCBI query with chr1 alias matches UCSC ref chr1)
652    // 4. Query alias → Ref alias (both have aliases that match)
653
654    #[test]
655    fn test_alias_match_query_name_to_ref_name() {
656        use crate::core::contig::Contig;
657        use crate::core::reference::KnownReference;
658        use crate::core::types::{Assembly, ReferenceSource};
659
660        // Direct match: query chr1 → ref chr1
661        let ref_contigs = vec![
662            Contig::new("chr1", 248_956_422),
663            Contig::new("chr2", 242_193_529),
664        ];
665        let reference = KnownReference::new(
666            "test_ref",
667            "Test Reference",
668            Assembly::Grch38,
669            ReferenceSource::Custom("test".to_string()),
670        )
671        .with_contigs(ref_contigs);
672
673        let query = QueryHeader::new(vec![
674            Contig::new("chr1", 248_956_422),
675            Contig::new("chr2", 242_193_529),
676        ]);
677
678        let score = MatchScore::calculate(&query, &reference);
679        assert!(
680            (score.name_length_jaccard - 1.0).abs() < 0.001,
681            "Direct name match should have jaccard = 1.0, got {}",
682            score.name_length_jaccard
683        );
684    }
685
686    #[test]
687    fn test_alias_match_query_name_to_ref_alias() {
688        use crate::core::contig::Contig;
689        use crate::core::reference::KnownReference;
690        use crate::core::types::{Assembly, ReferenceSource};
691
692        // Query uses UCSC names (chr1), ref uses NCBI names with chr1 as alias
693        let ref_contigs = vec![
694            Contig::new("NC_000001.11", 248_956_422)
695                .with_aliases(vec!["chr1".to_string(), "1".to_string()]),
696            Contig::new("NC_000002.12", 242_193_529)
697                .with_aliases(vec!["chr2".to_string(), "2".to_string()]),
698        ];
699        let reference = KnownReference::new(
700            "test_ncbi_ref",
701            "Test NCBI Reference",
702            Assembly::Grch38,
703            ReferenceSource::Custom("test".to_string()),
704        )
705        .with_contigs(ref_contigs);
706
707        // Query with UCSC names - NO aliases
708        let query = QueryHeader::new(vec![
709            Contig::new("chr1", 248_956_422),
710            Contig::new("chr2", 242_193_529),
711        ]);
712
713        let score = MatchScore::calculate(&query, &reference);
714        assert!(
715            (score.name_length_jaccard - 1.0).abs() < 0.001,
716            "Query name matching ref alias should have jaccard = 1.0, got {}",
717            score.name_length_jaccard
718        );
719    }
720
721    #[test]
722    fn test_alias_match_query_alias_to_ref_name() {
723        use crate::core::contig::Contig;
724        use crate::core::reference::KnownReference;
725        use crate::core::types::{Assembly, ReferenceSource};
726
727        // Ref uses UCSC names (chr1), query uses NCBI names with chr1 as alias
728        let ref_contigs = vec![
729            Contig::new("chr1", 248_956_422),
730            Contig::new("chr2", 242_193_529),
731        ];
732        let reference = KnownReference::new(
733            "test_ucsc_ref",
734            "Test UCSC Reference",
735            Assembly::Grch38,
736            ReferenceSource::Custom("test".to_string()),
737        )
738        .with_contigs(ref_contigs);
739
740        // Query with NCBI names that have UCSC aliases
741        let query = QueryHeader::new(vec![
742            Contig::new("NC_000001.11", 248_956_422)
743                .with_aliases(vec!["chr1".to_string(), "1".to_string()]),
744            Contig::new("NC_000002.12", 242_193_529)
745                .with_aliases(vec!["chr2".to_string(), "2".to_string()]),
746        ]);
747
748        let score = MatchScore::calculate(&query, &reference);
749        assert!(
750            (score.name_length_jaccard - 1.0).abs() < 0.001,
751            "Query alias matching ref name should have jaccard = 1.0, got {}",
752            score.name_length_jaccard
753        );
754    }
755
756    #[test]
757    fn test_alias_match_query_alias_to_ref_alias() {
758        use crate::core::contig::Contig;
759        use crate::core::reference::KnownReference;
760        use crate::core::types::{Assembly, ReferenceSource};
761
762        // Both query and ref use NCBI names with UCSC aliases
763        // They should match via their common alias
764        let ref_contigs = vec![
765            Contig::new("NC_000001.11", 248_956_422)
766                .with_aliases(vec!["chr1".to_string(), "CM000663.2".to_string()]),
767            Contig::new("NC_000002.12", 242_193_529)
768                .with_aliases(vec!["chr2".to_string(), "CM000664.2".to_string()]),
769        ];
770        let reference = KnownReference::new(
771            "test_ncbi_ref_1",
772            "Test NCBI Reference 1",
773            Assembly::Grch38,
774            ReferenceSource::Custom("test".to_string()),
775        )
776        .with_contigs(ref_contigs);
777
778        // Query with different primary names but overlapping aliases
779        let query = QueryHeader::new(vec![
780            Contig::new("CM000663.2", 248_956_422)
781                .with_aliases(vec!["chr1".to_string(), "NC_000001.11".to_string()]),
782            Contig::new("CM000664.2", 242_193_529)
783                .with_aliases(vec!["chr2".to_string(), "NC_000002.12".to_string()]),
784        ]);
785
786        let score = MatchScore::calculate(&query, &reference);
787        assert!(
788            (score.name_length_jaccard - 1.0).abs() < 0.001,
789            "Query alias matching ref alias should have jaccard = 1.0, got {}",
790            score.name_length_jaccard
791        );
792    }
793
794    // ========================================================================
795    // New Scoring Algorithm Tests
796    // ========================================================================
797
798    #[test]
799    fn test_perfect_name_length_match_no_md5_gets_high_score() {
800        use crate::core::contig::Contig;
801        use crate::core::reference::KnownReference;
802        use crate::core::types::{Assembly, ReferenceSource};
803
804        // Reference with 3 contigs, no MD5
805        let ref_contigs = vec![
806            Contig::new("chr1", 248_956_422),
807            Contig::new("chr2", 242_193_529),
808            Contig::new("chr3", 198_295_559),
809        ];
810        let reference = KnownReference::new(
811            "test_ref",
812            "Test Reference",
813            Assembly::Grch38,
814            ReferenceSource::Custom("test".to_string()),
815        )
816        .with_contigs(ref_contigs);
817
818        // Query matches perfectly, also no MD5
819        let query = QueryHeader::new(vec![
820            Contig::new("chr1", 248_956_422),
821            Contig::new("chr2", 242_193_529),
822            Contig::new("chr3", 198_295_559),
823        ]);
824
825        let score = MatchScore::calculate(&query, &reference);
826
827        // All contigs should be classified as NameLengthNoMd5 (neutral = full credit)
828        assert_eq!(
829            score.exact_matches, 0,
830            "No exact matches (no MD5 available)"
831        );
832        assert_eq!(
833            score.name_length_matches, 3,
834            "All 3 contigs should be name+length matches"
835        );
836        assert_eq!(score.md5_conflicts, 0, "No conflicts");
837        assert_eq!(score.unmatched, 0, "No unmatched");
838
839        // Contig match score should be 1.0 (3 neutral matches / 3 total)
840        assert!(
841            (score.match_quality - 1.0).abs() < 0.001,
842            "Contig match score should be 1.0, got {}",
843            score.match_quality
844        );
845
846        // Coverage should be 1.0 (3 good matches / 3 reference contigs)
847        assert!(
848            (score.coverage_score - 1.0).abs() < 0.001,
849            "Coverage score should be 1.0, got {}",
850            score.coverage_score
851        );
852
853        // Composite should be high: 0.7*1.0 + 0.2*1.0 + 0.1*order = 0.9 + order
854        // Order score should be 1.0 for 3 contigs in order
855        assert!(
856            score.composite > 0.9,
857            "100% name+length match with no MD5 should score > 90%, got {:.1}%",
858            score.composite * 100.0
859        );
860    }
861
862    #[test]
863    fn test_md5_conflict_gets_heavy_penalty() {
864        use crate::core::contig::Contig;
865        use crate::core::reference::KnownReference;
866        use crate::core::types::{Assembly, ReferenceSource};
867
868        // Reference with MD5
869        let ref_contigs = vec![
870            Contig::new("chr1", 248_956_422).with_md5("abc123"),
871            Contig::new("chr2", 242_193_529).with_md5("def456"),
872        ];
873        let reference = KnownReference::new(
874            "test_ref",
875            "Test Reference",
876            Assembly::Grch38,
877            ReferenceSource::Custom("test".to_string()),
878        )
879        .with_contigs(ref_contigs);
880
881        // Query with DIFFERENT MD5s (conflict!)
882        let query = QueryHeader::new(vec![
883            Contig::new("chr1", 248_956_422).with_md5("DIFFERENT1"),
884            Contig::new("chr2", 242_193_529).with_md5("DIFFERENT2"),
885        ]);
886
887        let score = MatchScore::calculate(&query, &reference);
888
889        // All contigs should be classified as Md5Conflict
890        assert_eq!(score.exact_matches, 0, "No exact matches");
891        assert_eq!(score.name_length_matches, 0, "No name+length-only matches");
892        assert_eq!(score.md5_conflicts, 2, "Both contigs have MD5 conflicts");
893        assert_eq!(score.unmatched, 0, "No unmatched");
894
895        // Contig match score should be 0.1 * 2 / 2 = 0.1 (10% credit for conflicts)
896        assert!(
897            (score.match_quality - 0.1).abs() < 0.001,
898            "Contig match score for conflicts should be 0.1, got {}",
899            score.match_quality
900        );
901
902        // Composite should be low due to conflicts
903        assert!(
904            score.composite < 0.2,
905            "MD5 conflicts should result in low score, got {:.1}%",
906            score.composite * 100.0
907        );
908    }
909
910    #[test]
911    fn test_mixed_match_types_weighted_correctly() {
912        use crate::core::contig::Contig;
913        use crate::core::reference::KnownReference;
914        use crate::core::types::{Assembly, ReferenceSource};
915
916        // Reference: 3 contigs with MD5
917        let ref_contigs = vec![
918            Contig::new("chr1", 248_956_422).with_md5("correct_md5_1"),
919            Contig::new("chr2", 242_193_529).with_md5("correct_md5_2"),
920            Contig::new("chr3", 198_295_559), // No MD5
921        ];
922        let reference = KnownReference::new(
923            "test_ref",
924            "Test Reference",
925            Assembly::Grch38,
926            ReferenceSource::Custom("test".to_string()),
927        )
928        .with_contigs(ref_contigs);
929
930        // Query: 4 contigs
931        // - chr1: exact match (MD5 matches)
932        // - chr2: conflict (MD5 differs)
933        // - chr3: name+length only (no MD5 on either side)
934        // - chr4: unmatched
935        let query = QueryHeader::new(vec![
936            Contig::new("chr1", 248_956_422).with_md5("correct_md5_1"),
937            Contig::new("chr2", 242_193_529).with_md5("WRONG_MD5"),
938            Contig::new("chr3", 198_295_559),
939            Contig::new("chr4", 100_000), // Unmatched
940        ]);
941
942        let score = MatchScore::calculate(&query, &reference);
943
944        assert_eq!(score.exact_matches, 1, "1 exact match (chr1)");
945        assert_eq!(score.name_length_matches, 1, "1 name+length match (chr3)");
946        assert_eq!(score.md5_conflicts, 1, "1 conflict (chr2)");
947        assert_eq!(score.unmatched, 1, "1 unmatched (chr4)");
948
949        // Contig match score = (1*1.0 + 1*1.0 + 1*0.1 + 0) / 4 = 2.1 / 4 = 0.525
950        let expected_match_score = (1.0 + 1.0 + 0.1) / 4.0;
951        assert!(
952            (score.match_quality - expected_match_score).abs() < 0.001,
953            "Expected match_quality = {}, got {}",
954            expected_match_score,
955            score.match_quality
956        );
957    }
958
959    #[test]
960    fn test_classify_contig_match_exact() {
961        use crate::core::contig::Contig;
962        use crate::core::reference::KnownReference;
963        use crate::core::types::{Assembly, ReferenceSource};
964
965        let ref_contigs = vec![Contig::new("chr1", 1000).with_md5("abc123")];
966        let reference = KnownReference::new(
967            "test",
968            "Test",
969            Assembly::Grch38,
970            ReferenceSource::Custom("test".to_string()),
971        )
972        .with_contigs(ref_contigs);
973
974        let query_contig = Contig::new("chr1", 1000).with_md5("abc123");
975        assert_eq!(
976            classify_contig_match(&query_contig, &reference),
977            ContigMatchType::Exact
978        );
979    }
980
981    #[test]
982    fn test_classify_contig_match_no_md5() {
983        use crate::core::contig::Contig;
984        use crate::core::reference::KnownReference;
985        use crate::core::types::{Assembly, ReferenceSource};
986
987        let ref_contigs = vec![Contig::new("chr1", 1000)]; // No MD5
988        let reference = KnownReference::new(
989            "test",
990            "Test",
991            Assembly::Grch38,
992            ReferenceSource::Custom("test".to_string()),
993        )
994        .with_contigs(ref_contigs);
995
996        let query_contig = Contig::new("chr1", 1000); // No MD5
997        assert_eq!(
998            classify_contig_match(&query_contig, &reference),
999            ContigMatchType::NameLengthNoMd5
1000        );
1001    }
1002
1003    #[test]
1004    fn test_classify_contig_match_md5_conflict() {
1005        use crate::core::contig::Contig;
1006        use crate::core::reference::KnownReference;
1007        use crate::core::types::{Assembly, ReferenceSource};
1008
1009        let ref_contigs = vec![Contig::new("chr1", 1000).with_md5("abc123")];
1010        let reference = KnownReference::new(
1011            "test",
1012            "Test",
1013            Assembly::Grch38,
1014            ReferenceSource::Custom("test".to_string()),
1015        )
1016        .with_contigs(ref_contigs);
1017
1018        let query_contig = Contig::new("chr1", 1000).with_md5("DIFFERENT");
1019        assert_eq!(
1020            classify_contig_match(&query_contig, &reference),
1021            ContigMatchType::Md5Conflict
1022        );
1023    }
1024
1025    #[test]
1026    fn test_classify_contig_match_unmatched() {
1027        use crate::core::contig::Contig;
1028        use crate::core::reference::KnownReference;
1029        use crate::core::types::{Assembly, ReferenceSource};
1030
1031        let ref_contigs = vec![Contig::new("chr1", 1000)];
1032        let reference = KnownReference::new(
1033            "test",
1034            "Test",
1035            Assembly::Grch38,
1036            ReferenceSource::Custom("test".to_string()),
1037        )
1038        .with_contigs(ref_contigs);
1039
1040        // Different name
1041        let query_contig = Contig::new("chr2", 1000);
1042        assert_eq!(
1043            classify_contig_match(&query_contig, &reference),
1044            ContigMatchType::Unmatched
1045        );
1046
1047        // Different length
1048        let query_contig2 = Contig::new("chr1", 2000);
1049        assert_eq!(
1050            classify_contig_match(&query_contig2, &reference),
1051            ContigMatchType::Unmatched
1052        );
1053    }
1054}