Skip to main content

ref_solver/matching/
diagnosis.rs

1use std::collections::{HashMap, HashSet};
2
3use crate::core::contig::Contig;
4use crate::core::header::QueryHeader;
5use crate::core::reference::KnownReference;
6use crate::core::types::MatchType;
7
8/// Detailed diagnosis of differences between query and reference
9#[derive(Debug, Clone)]
10pub struct MatchDiagnosis {
11    /// Type of match
12    pub match_type: MatchType,
13
14    /// Query contigs that match reference exactly
15    pub exact_matches: Vec<ContigMatch>,
16
17    /// Query contigs that match by MD5 but have different names
18    pub renamed_matches: Vec<RenamedContig>,
19
20    /// Query contigs with no match in reference
21    pub query_only: Vec<Contig>,
22
23    /// Contigs that are in a different order
24    pub reordered: bool,
25
26    /// Potential source conflicts (e.g., mito from different build)
27    pub conflicts: Vec<ContigConflict>,
28
29    /// Suggested fixes
30    pub suggestions: Vec<Suggestion>,
31}
32
33#[derive(Debug, Clone)]
34pub struct ContigMatch;
35
36#[derive(Debug, Clone)]
37pub struct RenamedContig {
38    pub query_name: String,
39    pub reference_name: String,
40}
41
42#[derive(Debug, Clone)]
43pub struct ContigConflict {
44    pub query_contig: Contig,
45    pub expected: Option<Contig>,
46    pub conflict_type: ConflictType,
47    pub description: String,
48}
49
50#[derive(Debug, Clone)]
51pub enum ConflictType {
52    /// Same name, different sequence (different MD5/length)
53    SequenceMismatch,
54    /// Mitochondrial from different build
55    MitochondrialMismatch,
56    /// Unknown contig not in reference
57    UnknownContig,
58}
59
60#[derive(Debug, Clone)]
61pub enum Suggestion {
62    /// Rename contigs using fgbio or similar
63    RenameContigs { command_hint: String },
64    /// Reorder contigs to match reference
65    ReorderContigs { command_hint: String },
66    /// Replace a specific contig (e.g., mito)
67    ReplaceContig {
68        contig_name: String,
69        reason: String,
70        source: String,
71    },
72    /// Reference is a close match, safe to use
73    UseAsIs { warnings: Vec<String> },
74    /// Need to realign with different reference
75    Realign {
76        reason: String,
77        suggested_reference: String,
78    },
79}
80
81impl MatchDiagnosis {
82    #[must_use]
83    #[allow(clippy::too_many_lines)] // TODO: Refactor into smaller functions
84    pub fn analyze(query: &QueryHeader, reference: &KnownReference) -> Self {
85        let mut exact_matches = Vec::new();
86        let mut renamed_matches = Vec::new();
87        let mut name_length_only_matches = Vec::new();
88        let mut query_only = Vec::new();
89        let mut conflicts = Vec::new();
90
91        // Build lookup maps for reference
92        let ref_by_md5: HashMap<&str, &Contig> = reference
93            .contigs
94            .iter()
95            .filter_map(|c| c.md5.as_ref().map(|m| (m.as_str(), c)))
96            .collect();
97
98        // Use exact names for matching (no normalization)
99        // Index by both primary name AND aliases for proper alias-based matching
100        let mut ref_by_name_length: HashMap<(String, u64), &Contig> = HashMap::new();
101        for contig in &reference.contigs {
102            // Primary name
103            ref_by_name_length.insert((contig.name.clone(), contig.length), contig);
104            // Also index by aliases so query names can match ref aliases
105            for alias in &contig.aliases {
106                ref_by_name_length.insert((alias.clone(), contig.length), contig);
107            }
108        }
109
110        let mut matched_ref_md5s: HashSet<&str> = HashSet::new();
111        let mut matched_ref_name_lengths: HashSet<(String, u64)> = HashSet::new();
112
113        // Analyze each query contig using exact names
114        for q_contig in &query.contigs {
115            let q_key = (q_contig.name.clone(), q_contig.length);
116
117            // Try MD5 match first
118            if let Some(q_md5) = &q_contig.md5 {
119                if let Some(r_contig) = ref_by_md5.get(q_md5.as_str()) {
120                    matched_ref_md5s.insert(q_md5.as_str());
121
122                    if q_contig.name == r_contig.name {
123                        // Exact match
124                        exact_matches.push(ContigMatch);
125                    } else {
126                        // Same sequence, different name
127                        renamed_matches.push(RenamedContig {
128                            query_name: q_contig.name.clone(),
129                            reference_name: r_contig.name.clone(),
130                        });
131                    }
132                    continue;
133                }
134            }
135
136            // Try name+length match (direct name or via alias)
137            let matched_ref = ref_by_name_length.get(&q_key).copied().or_else(|| {
138                // Try matching via query aliases (reverse alias matching)
139                q_contig.aliases.iter().find_map(|alias| {
140                    let alias_key = (alias.clone(), q_contig.length);
141                    ref_by_name_length.get(&alias_key).copied()
142                })
143            });
144
145            if let Some(r_contig) = matched_ref {
146                let matched_key = (r_contig.name.clone(), r_contig.length);
147                matched_ref_name_lengths.insert(matched_key);
148
149                // Check if this might be a conflict (same position but different sequence)
150                if let (Some(q_md5), Some(r_md5)) = (&q_contig.md5, &r_contig.md5) {
151                    if q_md5 != r_md5 {
152                        // Different sequence!
153                        let conflict_type = if q_contig.is_mitochondrial() {
154                            ConflictType::MitochondrialMismatch
155                        } else {
156                            ConflictType::SequenceMismatch
157                        };
158
159                        conflicts.push(ContigConflict {
160                            query_contig: q_contig.clone(),
161                            expected: Some(r_contig.clone()),
162                            conflict_type,
163                            description: format!(
164                                "Contig {} has same name/length but different MD5 (query: {}, ref: {})",
165                                q_contig.name, q_md5, r_md5
166                            ),
167                        });
168                        continue;
169                    }
170                }
171
172                // Check all 4 combinations - any match via name or alias should be treated equally:
173                // 1. Query primary == Ref primary
174                // 2. Query primary in Ref aliases
175                // 3. Query alias == Ref primary
176                // 4. Query alias in Ref aliases
177                let query_names_match_ref_primary = q_contig.name == r_contig.name
178                    || q_contig.aliases.iter().any(|a| a == &r_contig.name);
179                let query_names_match_ref_alias = r_contig.aliases.contains(&q_contig.name)
180                    || q_contig
181                        .aliases
182                        .iter()
183                        .any(|qa| r_contig.aliases.contains(qa));
184                let is_name_match = query_names_match_ref_primary || query_names_match_ref_alias;
185
186                if is_name_match {
187                    name_length_only_matches.push(ContigMatch);
188                } else {
189                    // Only count as renamed if no name/alias overlap at all
190                    renamed_matches.push(RenamedContig {
191                        query_name: q_contig.name.clone(),
192                        reference_name: r_contig.name.clone(),
193                    });
194                }
195                continue;
196            }
197
198            // No match found
199            let conflict_type = if q_contig.is_mitochondrial() {
200                // Special handling for mitochondrial
201                ConflictType::MitochondrialMismatch
202            } else {
203                ConflictType::UnknownContig
204            };
205
206            if q_contig.is_primary_chromosome() || q_contig.is_mitochondrial() {
207                conflicts.push(ContigConflict {
208                    query_contig: q_contig.clone(),
209                    expected: None,
210                    conflict_type,
211                    description: format!(
212                        "No match found for {} (length: {})",
213                        q_contig.name, q_contig.length
214                    ),
215                });
216            } else {
217                query_only.push(q_contig.clone());
218            }
219        }
220
221        // Determine match type and reordering
222        let reordered = !super::scoring::MatchScore::calculate(query, reference).order_preserved;
223
224        let match_type = determine_match_type(
225            &exact_matches,
226            &renamed_matches,
227            &name_length_only_matches,
228            &query_only,
229            &conflicts,
230            reordered,
231        );
232
233        // Generate suggestions
234        let suggestions = generate_suggestions(
235            &match_type,
236            &renamed_matches,
237            &conflicts,
238            reordered,
239            reference,
240        );
241
242        Self {
243            match_type,
244            exact_matches,
245            renamed_matches,
246            query_only,
247            reordered,
248            conflicts,
249            suggestions,
250        }
251    }
252}
253
254fn determine_match_type(
255    exact_matches: &[ContigMatch],
256    renamed_matches: &[RenamedContig],
257    name_length_only: &[ContigMatch],
258    query_only: &[Contig],
259    conflicts: &[ContigConflict],
260    reordered: bool,
261) -> MatchType {
262    let total_matched = exact_matches.len() + renamed_matches.len() + name_length_only.len();
263
264    // No matches at all
265    if total_matched == 0 {
266        return MatchType::NoMatch;
267    }
268
269    // All exact matches
270    if !exact_matches.is_empty()
271        && renamed_matches.is_empty()
272        && conflicts.is_empty()
273        && query_only.is_empty()
274    {
275        if reordered {
276            return MatchType::Reordered;
277        }
278        return MatchType::Exact;
279    }
280
281    // All matches but need renaming
282    if !renamed_matches.is_empty() && conflicts.is_empty() && query_only.is_empty() {
283        if reordered {
284            return MatchType::ReorderedAndRenamed;
285        }
286        return MatchType::Renamed;
287    }
288
289    // Has conflicts or unmatched - partial or mixed
290    if !conflicts.is_empty() {
291        // Check if conflicts suggest mixed sources
292        let has_mito_conflict = conflicts
293            .iter()
294            .any(|c| matches!(c.conflict_type, ConflictType::MitochondrialMismatch));
295
296        if has_mito_conflict && total_matched > conflicts.len() * 2 {
297            return MatchType::Mixed;
298        }
299    }
300
301    MatchType::Partial
302}
303
304fn generate_suggestions(
305    match_type: &MatchType,
306    renamed_matches: &[RenamedContig],
307    conflicts: &[ContigConflict],
308    reordered: bool,
309    reference: &KnownReference,
310) -> Vec<Suggestion> {
311    let mut suggestions = Vec::new();
312
313    // Renaming suggestion
314    if !renamed_matches.is_empty() {
315        suggestions.push(Suggestion::RenameContigs {
316            command_hint: "fgbio UpdateSequenceDictionary --in input.bam --out output.bam \\\n  \
317                 --sequence-dictionary reference.dict"
318                .to_string(),
319        });
320    }
321
322    // Reordering suggestion
323    if reordered {
324        suggestions.push(Suggestion::ReorderContigs {
325            command_hint: "picard ReorderSam \\\n  \
326                 I=input.bam \\\n  \
327                 O=output.bam \\\n  \
328                 REFERENCE=reference.fa"
329                .to_string(),
330        });
331    }
332
333    // Conflict-specific suggestions
334    for conflict in conflicts {
335        match &conflict.conflict_type {
336            ConflictType::MitochondrialMismatch => {
337                suggestions.push(Suggestion::ReplaceContig {
338                    contig_name: conflict.query_contig.name.clone(),
339                    reason: format!(
340                        "Mitochondrial sequence differs: {} ({}bp) vs expected ({}bp)",
341                        conflict.query_contig.name,
342                        conflict.query_contig.length,
343                        conflict.expected.as_ref().map_or(0, |c| c.length)
344                    ),
345                    source: reference.download_url.clone().unwrap_or_default(),
346                });
347            }
348            ConflictType::SequenceMismatch => {
349                suggestions.push(Suggestion::Realign {
350                    reason: format!(
351                        "Contig {} has different sequence than expected",
352                        conflict.query_contig.name
353                    ),
354                    suggested_reference: reference.id.to_string(),
355                });
356            }
357            ConflictType::UnknownContig => {
358                // Usually fine - just extra contigs
359            }
360        }
361    }
362
363    // Safe to use suggestion
364    if matches!(match_type, MatchType::Exact) && conflicts.is_empty() {
365        suggestions.push(Suggestion::UseAsIs {
366            warnings: Vec::new(),
367        });
368    }
369
370    suggestions
371}