Skip to main content

ref_solver/core/
header.rs

1use serde::{Deserialize, Serialize};
2use std::collections::HashSet;
3
4use crate::core::contig::{detect_naming_convention, Contig};
5use crate::core::types::NamingConvention;
6use crate::utils::validation::compute_signature;
7
/// Converts a `usize` count into an `f64` so it can be used in ratio arithmetic.
///
/// An `f64` carries a 53-bit significand, so counts larger than 2^53 would be
/// rounded. Funnelling every such cast through this helper keeps the lint
/// allowance in a single place.
#[inline]
#[allow(clippy::cast_precision_loss)] // deliberate: rounding of huge counts is accepted here
fn count_to_f64(count: usize) -> f64 {
    count as f64
}
16
17/// A query header extracted from a BAM/SAM/CRAM file
18#[derive(Debug, Clone, Serialize, Deserialize)]
19#[non_exhaustive]
20pub struct QueryHeader {
21    /// Source file path (if known)
22    #[serde(default, skip_serializing_if = "Option::is_none")]
23    pub source: Option<String>,
24
25    /// All contigs from @SQ lines
26    pub contigs: Vec<Contig>,
27
28    /// Detected naming convention
29    pub naming_convention: NamingConvention,
30
31    // === Pre-computed for matching ===
32    /// Set of MD5s present in header
33    #[serde(skip)]
34    pub md5_set: HashSet<String>,
35
36    /// Set of sha512t24u digests present in header
37    #[serde(skip)]
38    pub sha512t24u_set: HashSet<String>,
39
40    /// Set of (`exact_name`, length) pairs for matching
41    #[serde(skip)]
42    pub name_length_set: HashSet<(String, u64)>,
43
44    /// Set of (alias, length) pairs for alias-based matching
45    #[serde(skip)]
46    pub alias_length_set: HashSet<(String, u64)>,
47
48    /// Signature for exact matching
49    #[serde(skip)]
50    pub signature: Option<String>,
51}
52
53impl QueryHeader {
54    #[must_use]
55    pub fn new(contigs: Vec<Contig>) -> Self {
56        let naming_convention = detect_naming_convention(&contigs);
57
58        let mut header = Self {
59            source: None,
60            contigs,
61            naming_convention,
62            md5_set: HashSet::new(),
63            sha512t24u_set: HashSet::new(),
64            name_length_set: HashSet::new(),
65            alias_length_set: HashSet::new(),
66            signature: None,
67        };
68
69        header.rebuild_indexes();
70        header
71    }
72
73    #[must_use]
74    pub fn with_source(mut self, source: impl Into<String>) -> Self {
75        self.source = Some(source.into());
76        self
77    }
78
79    pub fn rebuild_indexes(&mut self) {
80        self.md5_set.clear();
81        self.sha512t24u_set.clear();
82        self.name_length_set.clear();
83        self.alias_length_set.clear();
84
85        for contig in &self.contigs {
86            if let Some(md5) = &contig.md5 {
87                self.md5_set.insert(md5.clone());
88            }
89            if let Some(digest) = &contig.sha512t24u {
90                self.sha512t24u_set.insert(digest.clone());
91            }
92            // Use exact name for matching (no normalization)
93            self.name_length_set
94                .insert((contig.name.clone(), contig.length));
95
96            // Also index aliases for reverse matching (query alias -> catalog name)
97            for alias in &contig.aliases {
98                self.alias_length_set.insert((alias.clone(), contig.length));
99            }
100        }
101
102        // Compute signature using centralized helper
103        let sig = compute_signature(&self.md5_set);
104        if !sig.is_empty() {
105            self.signature = Some(sig);
106        }
107    }
108
109    /// Check if header has MD5 information
110    #[must_use]
111    pub fn has_md5s(&self) -> bool {
112        !self.md5_set.is_empty()
113    }
114
115    /// Fraction of contigs with MD5 checksums
116    #[must_use]
117    pub fn md5_coverage(&self) -> f64 {
118        if self.contigs.is_empty() {
119            return 0.0;
120        }
121        count_to_f64(self.contigs.iter().filter(|c| c.md5.is_some()).count())
122            / count_to_f64(self.contigs.len())
123    }
124
125    /// Get only primary chromosomes (1-22, X, Y, MT)
126    #[cfg(test)]
127    #[must_use]
128    pub fn primary_contigs(&self) -> Vec<&Contig> {
129        self.contigs
130            .iter()
131            .filter(|c| c.is_primary_chromosome() || c.is_mitochondrial())
132            .collect()
133    }
134}
135
#[cfg(test)]
mod tests {
    use super::*;

    /// Two UCSC-named contigs without any checksum.
    fn plain_pair() -> Vec<Contig> {
        vec![Contig::new("chr1", 100), Contig::new("chr2", 200)]
    }

    /// Two UCSC-named contigs that both carry an MD5.
    fn md5_pair() -> Vec<Contig> {
        vec![
            Contig::new("chr1", 100).with_md5("abc123"),
            Contig::new("chr2", 200).with_md5("def456"),
        ]
    }

    #[test]
    fn test_query_header_new() {
        let header = QueryHeader::new(plain_pair());

        assert_eq!(header.contigs.len(), 2);
        assert_eq!(header.naming_convention, NamingConvention::Ucsc);
    }

    #[test]
    fn test_query_header_md5_set() {
        let header = QueryHeader::new(md5_pair());

        assert!(header.has_md5s());
        assert_eq!(header.md5_set.len(), 2);
        for expected in ["abc123", "def456"] {
            assert!(header.md5_set.contains(expected));
        }
    }

    #[test]
    fn test_query_header_no_md5() {
        let header = QueryHeader::new(plain_pair());

        assert!(!header.has_md5s());
        assert!(header.md5_set.is_empty());
        assert!(header.signature.is_none());
    }

    #[test]
    fn test_md5_coverage() {
        // Half of the contigs (chr1, chr3) carry an MD5.
        let header = QueryHeader::new(vec![
            Contig::new("chr1", 100).with_md5("abc123"),
            Contig::new("chr2", 200),
            Contig::new("chr3", 300).with_md5("ghi789"),
            Contig::new("chr4", 400),
        ]);

        let coverage = header.md5_coverage();
        assert!((coverage - 0.5).abs() < 0.01);
    }

    #[test]
    fn test_md5_coverage_empty() {
        let header = QueryHeader::new(Vec::new());

        let coverage = header.md5_coverage();
        assert!((coverage - 0.0).abs() < 0.01);
    }

    #[test]
    fn test_md5_coverage_full() {
        let header = QueryHeader::new(md5_pair());

        let coverage = header.md5_coverage();
        assert!((coverage - 1.0).abs() < 0.01);
    }

    #[test]
    fn test_name_length_set() {
        let header = QueryHeader::new(plain_pair());
        let index = &header.name_length_set;

        assert_eq!(index.len(), 2);
        // Names are stored exactly as given (no normalization).
        assert!(index.contains(&("chr1".to_string(), 100)));
        assert!(index.contains(&("chr2".to_string(), 200)));
    }

    #[test]
    fn test_signature_computed() {
        let header = QueryHeader::new(md5_pair());

        assert!(header.signature.is_some());
        assert_eq!(header.signature.as_deref().map(str::len), Some(32));
    }

    #[test]
    fn test_signature_deterministic() {
        let forward = QueryHeader::new(vec![
            Contig::new("chr1", 100).with_md5("abc"),
            Contig::new("chr2", 200).with_md5("def"),
        ]);
        let reversed = QueryHeader::new(vec![
            Contig::new("chr2", 200).with_md5("def"),
            Contig::new("chr1", 100).with_md5("abc"),
        ]);

        // The same MD5s in a different contig order must yield the same signature.
        assert_eq!(forward.signature, reversed.signature);
    }

    #[test]
    fn test_with_source() {
        let header = QueryHeader::new(vec![Contig::new("chr1", 100)])
            .with_source("/path/to/file.bam");

        assert_eq!(header.source, Some("/path/to/file.bam".to_string()));
    }

    #[test]
    fn test_primary_contigs() {
        let header = QueryHeader::new(vec![
            Contig::new("chr1", 100),
            Contig::new("chr22", 200),
            Contig::new("chrX", 300),
            Contig::new("chrY", 400),
            Contig::new("chrM", 500),
            Contig::new("chr1_random", 600),
            Contig::new("chrUn_gl000220", 700),
        ]);

        // Expected survivors: chr1, chr22, chrX, chrY, chrM.
        assert_eq!(header.primary_contigs().len(), 5);
    }

    #[test]
    fn test_detect_ucsc_naming() {
        let header = QueryHeader::new(plain_pair());
        assert_eq!(header.naming_convention, NamingConvention::Ucsc);
    }

    #[test]
    fn test_detect_ncbi_naming() {
        let header = QueryHeader::new(vec![Contig::new("1", 100), Contig::new("2", 200)]);
        assert_eq!(header.naming_convention, NamingConvention::Ncbi);
    }

    #[test]
    fn test_detect_mixed_naming() {
        let header = QueryHeader::new(vec![Contig::new("chr1", 100), Contig::new("2", 200)]);
        assert_eq!(header.naming_convention, NamingConvention::Mixed);
    }
}