1use std::collections::{HashMap, HashSet};
2
3use crate::core::contig::Contig;
4use crate::core::header::QueryHeader;
5use crate::core::reference::KnownReference;
6use crate::core::types::MatchType;
7
8#[derive(Debug, Clone)]
10pub struct MatchDiagnosis {
11 pub match_type: MatchType,
13
14 pub exact_matches: Vec<ContigMatch>,
16
17 pub renamed_matches: Vec<RenamedContig>,
19
20 pub query_only: Vec<Contig>,
22
23 pub reordered: bool,
25
26 pub conflicts: Vec<ContigConflict>,
28
29 pub suggestions: Vec<Suggestion>,
31}
32
33#[derive(Debug, Clone)]
34pub struct ContigMatch;
35
36#[derive(Debug, Clone)]
37pub struct RenamedContig {
38 pub query_name: String,
39 pub reference_name: String,
40}
41
42#[derive(Debug, Clone)]
43pub struct ContigConflict {
44 pub query_contig: Contig,
45 pub expected: Option<Contig>,
46 pub conflict_type: ConflictType,
47 pub description: String,
48}
49
50#[derive(Debug, Clone)]
51pub enum ConflictType {
52 SequenceMismatch,
54 MitochondrialMismatch,
56 UnknownContig,
58}
59
60#[derive(Debug, Clone)]
61pub enum Suggestion {
62 RenameContigs { command_hint: String },
64 ReorderContigs { command_hint: String },
66 ReplaceContig {
68 contig_name: String,
69 reason: String,
70 source: String,
71 },
72 UseAsIs { warnings: Vec<String> },
74 Realign {
76 reason: String,
77 suggested_reference: String,
78 },
79}
80
81impl MatchDiagnosis {
82 #[must_use]
83 #[allow(clippy::too_many_lines)] pub fn analyze(query: &QueryHeader, reference: &KnownReference) -> Self {
85 let mut exact_matches = Vec::new();
86 let mut renamed_matches = Vec::new();
87 let mut name_length_only_matches = Vec::new();
88 let mut query_only = Vec::new();
89 let mut conflicts = Vec::new();
90
91 let ref_by_md5: HashMap<&str, &Contig> = reference
93 .contigs
94 .iter()
95 .filter_map(|c| c.md5.as_ref().map(|m| (m.as_str(), c)))
96 .collect();
97
98 let mut ref_by_name_length: HashMap<(String, u64), &Contig> = HashMap::new();
101 for contig in &reference.contigs {
102 ref_by_name_length.insert((contig.name.clone(), contig.length), contig);
104 for alias in &contig.aliases {
106 ref_by_name_length.insert((alias.clone(), contig.length), contig);
107 }
108 }
109
110 let mut matched_ref_md5s: HashSet<&str> = HashSet::new();
111 let mut matched_ref_name_lengths: HashSet<(String, u64)> = HashSet::new();
112
113 for q_contig in &query.contigs {
115 let q_key = (q_contig.name.clone(), q_contig.length);
116
117 if let Some(q_md5) = &q_contig.md5 {
119 if let Some(r_contig) = ref_by_md5.get(q_md5.as_str()) {
120 matched_ref_md5s.insert(q_md5.as_str());
121
122 if q_contig.name == r_contig.name {
123 exact_matches.push(ContigMatch);
125 } else {
126 renamed_matches.push(RenamedContig {
128 query_name: q_contig.name.clone(),
129 reference_name: r_contig.name.clone(),
130 });
131 }
132 continue;
133 }
134 }
135
136 let matched_ref = ref_by_name_length.get(&q_key).copied().or_else(|| {
138 q_contig.aliases.iter().find_map(|alias| {
140 let alias_key = (alias.clone(), q_contig.length);
141 ref_by_name_length.get(&alias_key).copied()
142 })
143 });
144
145 if let Some(r_contig) = matched_ref {
146 let matched_key = (r_contig.name.clone(), r_contig.length);
147 matched_ref_name_lengths.insert(matched_key);
148
149 if let (Some(q_md5), Some(r_md5)) = (&q_contig.md5, &r_contig.md5) {
151 if q_md5 != r_md5 {
152 let conflict_type = if q_contig.is_mitochondrial() {
154 ConflictType::MitochondrialMismatch
155 } else {
156 ConflictType::SequenceMismatch
157 };
158
159 conflicts.push(ContigConflict {
160 query_contig: q_contig.clone(),
161 expected: Some(r_contig.clone()),
162 conflict_type,
163 description: format!(
164 "Contig {} has same name/length but different MD5 (query: {}, ref: {})",
165 q_contig.name, q_md5, r_md5
166 ),
167 });
168 continue;
169 }
170 }
171
172 let query_names_match_ref_primary = q_contig.name == r_contig.name
178 || q_contig.aliases.iter().any(|a| a == &r_contig.name);
179 let query_names_match_ref_alias = r_contig.aliases.contains(&q_contig.name)
180 || q_contig
181 .aliases
182 .iter()
183 .any(|qa| r_contig.aliases.contains(qa));
184 let is_name_match = query_names_match_ref_primary || query_names_match_ref_alias;
185
186 if is_name_match {
187 name_length_only_matches.push(ContigMatch);
188 } else {
189 renamed_matches.push(RenamedContig {
191 query_name: q_contig.name.clone(),
192 reference_name: r_contig.name.clone(),
193 });
194 }
195 continue;
196 }
197
198 let conflict_type = if q_contig.is_mitochondrial() {
200 ConflictType::MitochondrialMismatch
202 } else {
203 ConflictType::UnknownContig
204 };
205
206 if q_contig.is_primary_chromosome() || q_contig.is_mitochondrial() {
207 conflicts.push(ContigConflict {
208 query_contig: q_contig.clone(),
209 expected: None,
210 conflict_type,
211 description: format!(
212 "No match found for {} (length: {})",
213 q_contig.name, q_contig.length
214 ),
215 });
216 } else {
217 query_only.push(q_contig.clone());
218 }
219 }
220
221 let reordered = !super::scoring::MatchScore::calculate(query, reference).order_preserved;
223
224 let match_type = determine_match_type(
225 &exact_matches,
226 &renamed_matches,
227 &name_length_only_matches,
228 &query_only,
229 &conflicts,
230 reordered,
231 );
232
233 let suggestions = generate_suggestions(
235 &match_type,
236 &renamed_matches,
237 &conflicts,
238 reordered,
239 reference,
240 );
241
242 Self {
243 match_type,
244 exact_matches,
245 renamed_matches,
246 query_only,
247 reordered,
248 conflicts,
249 suggestions,
250 }
251 }
252}
253
254fn determine_match_type(
255 exact_matches: &[ContigMatch],
256 renamed_matches: &[RenamedContig],
257 name_length_only: &[ContigMatch],
258 query_only: &[Contig],
259 conflicts: &[ContigConflict],
260 reordered: bool,
261) -> MatchType {
262 let total_matched = exact_matches.len() + renamed_matches.len() + name_length_only.len();
263
264 if total_matched == 0 {
266 return MatchType::NoMatch;
267 }
268
269 if !exact_matches.is_empty()
271 && renamed_matches.is_empty()
272 && conflicts.is_empty()
273 && query_only.is_empty()
274 {
275 if reordered {
276 return MatchType::Reordered;
277 }
278 return MatchType::Exact;
279 }
280
281 if !renamed_matches.is_empty() && conflicts.is_empty() && query_only.is_empty() {
283 if reordered {
284 return MatchType::ReorderedAndRenamed;
285 }
286 return MatchType::Renamed;
287 }
288
289 if !conflicts.is_empty() {
291 let has_mito_conflict = conflicts
293 .iter()
294 .any(|c| matches!(c.conflict_type, ConflictType::MitochondrialMismatch));
295
296 if has_mito_conflict && total_matched > conflicts.len() * 2 {
297 return MatchType::Mixed;
298 }
299 }
300
301 MatchType::Partial
302}
303
304fn generate_suggestions(
305 match_type: &MatchType,
306 renamed_matches: &[RenamedContig],
307 conflicts: &[ContigConflict],
308 reordered: bool,
309 reference: &KnownReference,
310) -> Vec<Suggestion> {
311 let mut suggestions = Vec::new();
312
313 if !renamed_matches.is_empty() {
315 suggestions.push(Suggestion::RenameContigs {
316 command_hint: "fgbio UpdateSequenceDictionary --in input.bam --out output.bam \\\n \
317 --sequence-dictionary reference.dict"
318 .to_string(),
319 });
320 }
321
322 if reordered {
324 suggestions.push(Suggestion::ReorderContigs {
325 command_hint: "picard ReorderSam \\\n \
326 I=input.bam \\\n \
327 O=output.bam \\\n \
328 REFERENCE=reference.fa"
329 .to_string(),
330 });
331 }
332
333 for conflict in conflicts {
335 match &conflict.conflict_type {
336 ConflictType::MitochondrialMismatch => {
337 suggestions.push(Suggestion::ReplaceContig {
338 contig_name: conflict.query_contig.name.clone(),
339 reason: format!(
340 "Mitochondrial sequence differs: {} ({}bp) vs expected ({}bp)",
341 conflict.query_contig.name,
342 conflict.query_contig.length,
343 conflict.expected.as_ref().map_or(0, |c| c.length)
344 ),
345 source: reference.download_url.clone().unwrap_or_default(),
346 });
347 }
348 ConflictType::SequenceMismatch => {
349 suggestions.push(Suggestion::Realign {
350 reason: format!(
351 "Contig {} has different sequence than expected",
352 conflict.query_contig.name
353 ),
354 suggested_reference: reference.id.to_string(),
355 });
356 }
357 ConflictType::UnknownContig => {
358 }
360 }
361 }
362
363 if matches!(match_type, MatchType::Exact) && conflicts.is_empty() {
365 suggestions.push(Suggestion::UseAsIs {
366 warnings: Vec::new(),
367 });
368 }
369
370 suggestions
371}