Skip to main content

ref_solver/parsing/
sam.rs

1use std::io::BufReader;
2use std::path::Path;
3use thiserror::Error;
4use tracing::warn;
5
6use crate::core::contig::Contig;
7use crate::core::header::QueryHeader;
8use crate::utils::validation::{check_contig_limit, normalize_md5};
9
10#[derive(Error, Debug)]
11pub enum ParseError {
12    #[error("IO error: {0}")]
13    Io(#[from] std::io::Error),
14
15    #[error("Invalid SAM header format: {0}")]
16    InvalidFormat(String),
17
18    #[error("noodles error: {0}")]
19    Noodles(String),
20
21    #[error("Unsupported file format: {0}")]
22    UnsupportedFormat(String),
23
24    #[error("Too many contigs: {0} exceeds maximum allowed (100000)")]
25    TooManyContigs(usize),
26}
27
28/// Parse a SAM/BAM/CRAM file and extract the header
29///
30/// # Errors
31///
32/// Returns `ParseError::Io` if the file cannot be read, `ParseError::Noodles` if
33/// parsing fails, `ParseError::UnsupportedFormat` for unknown extensions,
34/// `ParseError::InvalidFormat` if no contigs are found, or
35/// `ParseError::TooManyContigs` if the limit is exceeded.
36pub fn parse_file(path: &Path) -> Result<QueryHeader, ParseError> {
37    let extension = path
38        .extension()
39        .and_then(|e| e.to_str())
40        .map(str::to_lowercase);
41
42    match extension.as_deref() {
43        Some("sam") => parse_sam_file(path),
44        Some("bam") => parse_bam_file(path),
45        Some("cram") => parse_cram_file(path),
46        Some(ext) => Err(ParseError::UnsupportedFormat(ext.to_string())),
47        None => {
48            // Try to detect from content - default to SAM
49            parse_sam_file(path)
50        }
51    }
52}
53
54/// Parse a SAM file (text format)
55fn parse_sam_file(path: &Path) -> Result<QueryHeader, ParseError> {
56    use noodles::sam;
57
58    let mut reader = std::fs::File::open(path)
59        .map(BufReader::new)
60        .map(sam::io::Reader::new)?;
61
62    let header = reader
63        .read_header()
64        .map_err(|e| ParseError::Noodles(e.to_string()))?;
65
66    header_to_query(&header, Some(path))
67}
68
69/// Parse a BAM file (binary format)
70fn parse_bam_file(path: &Path) -> Result<QueryHeader, ParseError> {
71    use noodles::bam;
72
73    let mut reader = std::fs::File::open(path).map(bam::io::Reader::new)?;
74
75    let header = reader
76        .read_header()
77        .map_err(|e| ParseError::Noodles(e.to_string()))?;
78
79    header_to_query(&header, Some(path))
80}
81
82/// Parse a CRAM file
83fn parse_cram_file(path: &Path) -> Result<QueryHeader, ParseError> {
84    use noodles::cram;
85
86    let mut reader = std::fs::File::open(path).map(cram::io::Reader::new)?;
87
88    // Read file definition
89    reader
90        .read_file_definition()
91        .map_err(|e| ParseError::Noodles(e.to_string()))?;
92
93    let header = reader
94        .read_file_header()
95        .map_err(|e| ParseError::Noodles(e.to_string()))?;
96
97    header_to_query(&header, Some(path))
98}
99
100/// Convert noodles header to `QueryHeader`
101fn header_to_query(
102    header: &noodles::sam::Header,
103    source: Option<&Path>,
104) -> Result<QueryHeader, ParseError> {
105    use noodles::sam::header::record::value::map::tag::Other;
106
107    let mut contigs = Vec::new();
108
109    for (name, map) in header.reference_sequences() {
110        let name_str = name.to_string();
111        let length = map.length().get() as u64;
112
113        let mut contig = Contig::new(name_str, length);
114
115        // Extract M5 (MD5) tag from other_fields
116        // The M5 tag contains the MD5 checksum of the sequence
117        if let Ok(m5_tag) = Other::try_from(*b"M5") {
118            if let Some(md5_value) = map.other_fields().get(&m5_tag) {
119                let md5_str = md5_value.to_string();
120                // Validate and normalize MD5 using centralized helper
121                if let Some(normalized) = normalize_md5(&md5_str) {
122                    contig.md5 = Some(normalized);
123                } else {
124                    warn!(
125                        contig = %contig.name,
126                        md5 = %md5_str,
127                        "Invalid MD5 checksum format, ignoring"
128                    );
129                }
130            }
131        }
132
133        // Extract AS (Assembly) tag
134        if let Ok(as_tag) = Other::try_from(*b"AS") {
135            if let Some(assembly_value) = map.other_fields().get(&as_tag) {
136                contig.assembly = Some(assembly_value.to_string());
137            }
138        }
139
140        // Extract UR (URI) tag
141        if let Ok(ur_tag) = Other::try_from(*b"UR") {
142            if let Some(uri_value) = map.other_fields().get(&ur_tag) {
143                contig.uri = Some(uri_value.to_string());
144            }
145        }
146
147        // Extract SP (Species) tag
148        if let Ok(sp_tag) = Other::try_from(*b"SP") {
149            if let Some(species_value) = map.other_fields().get(&sp_tag) {
150                contig.species = Some(species_value.to_string());
151            }
152        }
153
154        // Extract AN (Alternate Names) tag - comma-separated list of aliases
155        if let Ok(an_tag) = Other::try_from(*b"AN") {
156            if let Some(aliases_value) = map.other_fields().get(&an_tag) {
157                let aliases: Vec<String> = aliases_value
158                    .to_string()
159                    .split(',')
160                    .map(|s| s.trim().to_string())
161                    .filter(|s| !s.is_empty())
162                    .collect();
163                if !aliases.is_empty() {
164                    contig.aliases = aliases;
165                }
166            }
167        }
168
169        // Check contig limit for DOS protection
170        if check_contig_limit(contigs.len()).is_some() {
171            return Err(ParseError::TooManyContigs(contigs.len()));
172        }
173
174        contigs.push(contig);
175    }
176
177    let mut query = QueryHeader::new(contigs);
178    if let Some(path) = source {
179        query = query.with_source(path.display().to_string());
180    }
181
182    Ok(query)
183}
184
185/// Parse header from raw text (stdin or pasted)
186///
187/// # Errors
188///
189/// Returns `ParseError::InvalidFormat` if the text has invalid format, missing
190/// required fields, or no contigs are found, or `ParseError::TooManyContigs`
191/// if the limit is exceeded.
192pub fn parse_header_text(text: &str) -> Result<QueryHeader, ParseError> {
193    let mut contigs = Vec::new();
194
195    for line in text.lines() {
196        if !line.starts_with("@SQ") {
197            continue;
198        }
199
200        let mut name: Option<String> = None;
201        let mut length: Option<u64> = None;
202        let mut md5_raw: Option<String> = None;
203        let mut assembly: Option<String> = None;
204        let mut uri: Option<String> = None;
205        let mut species: Option<String> = None;
206        let mut aliases: Vec<String> = Vec::new();
207
208        for field in line.split('\t').skip(1) {
209            if let Some((tag, value)) = field.split_once(':') {
210                match tag {
211                    "SN" => name = Some(value.to_string()),
212                    "LN" => length = value.parse().ok(),
213                    "M5" => md5_raw = Some(value.to_string()),
214                    "AS" => assembly = Some(value.to_string()),
215                    "UR" => uri = Some(value.to_string()),
216                    "SP" => species = Some(value.to_string()),
217                    "AN" => {
218                        // Alternate names (aliases), comma-separated
219                        aliases = value
220                            .split(',')
221                            .map(|s| s.trim().to_string())
222                            .filter(|s| !s.is_empty())
223                            .collect();
224                    }
225                    _ => {}
226                }
227            }
228        }
229
230        if let (Some(ref name_str), Some(length)) = (&name, length) {
231            // Check contig limit for DOS protection
232            if check_contig_limit(contigs.len()).is_some() {
233                return Err(ParseError::TooManyContigs(contigs.len()));
234            }
235
236            // Validate and normalize MD5, warn if invalid
237            let md5 = if let Some(ref raw) = md5_raw {
238                if let Some(normalized) = normalize_md5(raw) {
239                    Some(normalized)
240                } else {
241                    warn!(
242                        contig = %name_str,
243                        md5 = %raw,
244                        "Invalid MD5 checksum format, ignoring"
245                    );
246                    None
247                }
248            } else {
249                None
250            };
251
252            let mut contig = Contig::new(name_str.clone(), length);
253            contig.md5 = md5;
254            contig.assembly = assembly;
255            contig.uri = uri;
256            contig.species = species;
257            contig.aliases = aliases;
258            contigs.push(contig);
259        }
260    }
261
262    if contigs.is_empty() {
263        return Err(ParseError::InvalidFormat(
264            "No @SQ lines found in header".to_string(),
265        ));
266    }
267
268    Ok(QueryHeader::new(contigs))
269}
270
271#[cfg(test)]
272mod tests {
273    use super::*;
274
275    #[test]
276    fn test_parse_header_text() {
277        let header = r"@HD	VN:1.6	SO:coordinate
278@SQ	SN:chr1	LN:248956422	M5:6aef897c3d6ff0c78aff06ac189178dd
279@SQ	SN:chr2	LN:242193529	M5:f98db672eb0993dcfdabafe2a882905c
280@SQ	SN:chrM	LN:16569
281@RG	ID:sample1
282";
283
284        let query = parse_header_text(header).unwrap();
285        assert_eq!(query.contigs.len(), 3);
286
287        assert_eq!(query.contigs[0].name, "chr1");
288        assert_eq!(query.contigs[0].length, 248_956_422);
289        assert_eq!(
290            query.contigs[0].md5,
291            Some("6aef897c3d6ff0c78aff06ac189178dd".to_string())
292        );
293
294        assert_eq!(query.contigs[1].name, "chr2");
295        assert_eq!(query.contigs[2].name, "chrM");
296        assert!(query.contigs[2].md5.is_none());
297    }
298
299    #[test]
300    fn test_parse_header_text_no_sq() {
301        let header = "@HD\tVN:1.6\n@RG\tID:sample1\n";
302        let result = parse_header_text(header);
303        assert!(result.is_err());
304    }
305
306    #[test]
307    fn test_parse_header_text_with_aliases() {
308        let header = r"@HD	VN:1.6
309@SQ	SN:chr1	LN:248956422	M5:6aef897c3d6ff0c78aff06ac189178dd	AN:1,NC_000001.11
310@SQ	SN:chrM	LN:16569	AN:MT,chrMT,NC_012920.1
311";
312
313        let query = parse_header_text(header).unwrap();
314        assert_eq!(query.contigs.len(), 2);
315
316        // Check chr1 aliases
317        assert_eq!(query.contigs[0].name, "chr1");
318        assert_eq!(
319            query.contigs[0].aliases,
320            vec!["1".to_string(), "NC_000001.11".to_string()]
321        );
322
323        // Check chrM aliases
324        assert_eq!(query.contigs[1].name, "chrM");
325        assert_eq!(
326            query.contigs[1].aliases,
327            vec![
328                "MT".to_string(),
329                "chrMT".to_string(),
330                "NC_012920.1".to_string()
331            ]
332        );
333    }
334}