ref_solver/parsing/
sam.rs1use std::io::BufReader;
2use std::path::Path;
3use thiserror::Error;
4use tracing::warn;
5
6use crate::core::contig::Contig;
7use crate::core::header::QueryHeader;
8use crate::utils::validation::{check_contig_limit, normalize_md5};
9
10#[derive(Error, Debug)]
11pub enum ParseError {
12 #[error("IO error: {0}")]
13 Io(#[from] std::io::Error),
14
15 #[error("Invalid SAM header format: {0}")]
16 InvalidFormat(String),
17
18 #[error("noodles error: {0}")]
19 Noodles(String),
20
21 #[error("Unsupported file format: {0}")]
22 UnsupportedFormat(String),
23
24 #[error("Too many contigs: {0} exceeds maximum allowed (100000)")]
25 TooManyContigs(usize),
26}
27
28pub fn parse_file(path: &Path) -> Result<QueryHeader, ParseError> {
37 let extension = path
38 .extension()
39 .and_then(|e| e.to_str())
40 .map(str::to_lowercase);
41
42 match extension.as_deref() {
43 Some("sam") => parse_sam_file(path),
44 Some("bam") => parse_bam_file(path),
45 Some("cram") => parse_cram_file(path),
46 Some(ext) => Err(ParseError::UnsupportedFormat(ext.to_string())),
47 None => {
48 parse_sam_file(path)
50 }
51 }
52}
53
54fn parse_sam_file(path: &Path) -> Result<QueryHeader, ParseError> {
56 use noodles::sam;
57
58 let mut reader = std::fs::File::open(path)
59 .map(BufReader::new)
60 .map(sam::io::Reader::new)?;
61
62 let header = reader
63 .read_header()
64 .map_err(|e| ParseError::Noodles(e.to_string()))?;
65
66 header_to_query(&header, Some(path))
67}
68
69fn parse_bam_file(path: &Path) -> Result<QueryHeader, ParseError> {
71 use noodles::bam;
72
73 let mut reader = std::fs::File::open(path).map(bam::io::Reader::new)?;
74
75 let header = reader
76 .read_header()
77 .map_err(|e| ParseError::Noodles(e.to_string()))?;
78
79 header_to_query(&header, Some(path))
80}
81
82fn parse_cram_file(path: &Path) -> Result<QueryHeader, ParseError> {
84 use noodles::cram;
85
86 let mut reader = std::fs::File::open(path).map(cram::io::Reader::new)?;
87
88 reader
90 .read_file_definition()
91 .map_err(|e| ParseError::Noodles(e.to_string()))?;
92
93 let header = reader
94 .read_file_header()
95 .map_err(|e| ParseError::Noodles(e.to_string()))?;
96
97 header_to_query(&header, Some(path))
98}
99
100fn header_to_query(
102 header: &noodles::sam::Header,
103 source: Option<&Path>,
104) -> Result<QueryHeader, ParseError> {
105 use noodles::sam::header::record::value::map::tag::Other;
106
107 let mut contigs = Vec::new();
108
109 for (name, map) in header.reference_sequences() {
110 let name_str = name.to_string();
111 let length = map.length().get() as u64;
112
113 let mut contig = Contig::new(name_str, length);
114
115 if let Ok(m5_tag) = Other::try_from(*b"M5") {
118 if let Some(md5_value) = map.other_fields().get(&m5_tag) {
119 let md5_str = md5_value.to_string();
120 if let Some(normalized) = normalize_md5(&md5_str) {
122 contig.md5 = Some(normalized);
123 } else {
124 warn!(
125 contig = %contig.name,
126 md5 = %md5_str,
127 "Invalid MD5 checksum format, ignoring"
128 );
129 }
130 }
131 }
132
133 if let Ok(as_tag) = Other::try_from(*b"AS") {
135 if let Some(assembly_value) = map.other_fields().get(&as_tag) {
136 contig.assembly = Some(assembly_value.to_string());
137 }
138 }
139
140 if let Ok(ur_tag) = Other::try_from(*b"UR") {
142 if let Some(uri_value) = map.other_fields().get(&ur_tag) {
143 contig.uri = Some(uri_value.to_string());
144 }
145 }
146
147 if let Ok(sp_tag) = Other::try_from(*b"SP") {
149 if let Some(species_value) = map.other_fields().get(&sp_tag) {
150 contig.species = Some(species_value.to_string());
151 }
152 }
153
154 if let Ok(an_tag) = Other::try_from(*b"AN") {
156 if let Some(aliases_value) = map.other_fields().get(&an_tag) {
157 let aliases: Vec<String> = aliases_value
158 .to_string()
159 .split(',')
160 .map(|s| s.trim().to_string())
161 .filter(|s| !s.is_empty())
162 .collect();
163 if !aliases.is_empty() {
164 contig.aliases = aliases;
165 }
166 }
167 }
168
169 if check_contig_limit(contigs.len()).is_some() {
171 return Err(ParseError::TooManyContigs(contigs.len()));
172 }
173
174 contigs.push(contig);
175 }
176
177 let mut query = QueryHeader::new(contigs);
178 if let Some(path) = source {
179 query = query.with_source(path.display().to_string());
180 }
181
182 Ok(query)
183}
184
185pub fn parse_header_text(text: &str) -> Result<QueryHeader, ParseError> {
193 let mut contigs = Vec::new();
194
195 for line in text.lines() {
196 if !line.starts_with("@SQ") {
197 continue;
198 }
199
200 let mut name: Option<String> = None;
201 let mut length: Option<u64> = None;
202 let mut md5_raw: Option<String> = None;
203 let mut assembly: Option<String> = None;
204 let mut uri: Option<String> = None;
205 let mut species: Option<String> = None;
206 let mut aliases: Vec<String> = Vec::new();
207
208 for field in line.split('\t').skip(1) {
209 if let Some((tag, value)) = field.split_once(':') {
210 match tag {
211 "SN" => name = Some(value.to_string()),
212 "LN" => length = value.parse().ok(),
213 "M5" => md5_raw = Some(value.to_string()),
214 "AS" => assembly = Some(value.to_string()),
215 "UR" => uri = Some(value.to_string()),
216 "SP" => species = Some(value.to_string()),
217 "AN" => {
218 aliases = value
220 .split(',')
221 .map(|s| s.trim().to_string())
222 .filter(|s| !s.is_empty())
223 .collect();
224 }
225 _ => {}
226 }
227 }
228 }
229
230 if let (Some(ref name_str), Some(length)) = (&name, length) {
231 if check_contig_limit(contigs.len()).is_some() {
233 return Err(ParseError::TooManyContigs(contigs.len()));
234 }
235
236 let md5 = if let Some(ref raw) = md5_raw {
238 if let Some(normalized) = normalize_md5(raw) {
239 Some(normalized)
240 } else {
241 warn!(
242 contig = %name_str,
243 md5 = %raw,
244 "Invalid MD5 checksum format, ignoring"
245 );
246 None
247 }
248 } else {
249 None
250 };
251
252 let mut contig = Contig::new(name_str.clone(), length);
253 contig.md5 = md5;
254 contig.assembly = assembly;
255 contig.uri = uri;
256 contig.species = species;
257 contig.aliases = aliases;
258 contigs.push(contig);
259 }
260 }
261
262 if contigs.is_empty() {
263 return Err(ParseError::InvalidFormat(
264 "No @SQ lines found in header".to_string(),
265 ));
266 }
267
268 Ok(QueryHeader::new(contigs))
269}
270
271#[cfg(test)]
272mod tests {
273 use super::*;
274
275 #[test]
276 fn test_parse_header_text() {
277 let header = r"@HD VN:1.6 SO:coordinate
278@SQ SN:chr1 LN:248956422 M5:6aef897c3d6ff0c78aff06ac189178dd
279@SQ SN:chr2 LN:242193529 M5:f98db672eb0993dcfdabafe2a882905c
280@SQ SN:chrM LN:16569
281@RG ID:sample1
282";
283
284 let query = parse_header_text(header).unwrap();
285 assert_eq!(query.contigs.len(), 3);
286
287 assert_eq!(query.contigs[0].name, "chr1");
288 assert_eq!(query.contigs[0].length, 248_956_422);
289 assert_eq!(
290 query.contigs[0].md5,
291 Some("6aef897c3d6ff0c78aff06ac189178dd".to_string())
292 );
293
294 assert_eq!(query.contigs[1].name, "chr2");
295 assert_eq!(query.contigs[2].name, "chrM");
296 assert!(query.contigs[2].md5.is_none());
297 }
298
299 #[test]
300 fn test_parse_header_text_no_sq() {
301 let header = "@HD\tVN:1.6\n@RG\tID:sample1\n";
302 let result = parse_header_text(header);
303 assert!(result.is_err());
304 }
305
306 #[test]
307 fn test_parse_header_text_with_aliases() {
308 let header = r"@HD VN:1.6
309@SQ SN:chr1 LN:248956422 M5:6aef897c3d6ff0c78aff06ac189178dd AN:1,NC_000001.11
310@SQ SN:chrM LN:16569 AN:MT,chrMT,NC_012920.1
311";
312
313 let query = parse_header_text(header).unwrap();
314 assert_eq!(query.contigs.len(), 2);
315
316 assert_eq!(query.contigs[0].name, "chr1");
318 assert_eq!(
319 query.contigs[0].aliases,
320 vec!["1".to_string(), "NC_000001.11".to_string()]
321 );
322
323 assert_eq!(query.contigs[1].name, "chrM");
325 assert_eq!(
326 query.contigs[1].aliases,
327 vec![
328 "MT".to_string(),
329 "chrMT".to_string(),
330 "NC_012920.1".to_string()
331 ]
332 );
333 }
334}