1use std::borrow::Cow;
2use std::io::BufReader;
3use std::path::Path;
4use thiserror::Error;
5use tracing::warn;
6
7use crate::core::contig::Contig;
8use crate::core::header::QueryHeader;
9use crate::utils::validation::{check_contig_limit, normalize_md5};
10
/// Errors produced while parsing alignment headers from files, byte
/// streams, or raw header text.
#[derive(Error, Debug)]
pub enum ParseError {
    /// Underlying I/O failure (e.g. the input file could not be opened).
    #[error("IO error: {0}")]
    Io(#[from] std::io::Error),

    /// The header text is structurally invalid (e.g. no `@SQ` lines).
    #[error("Invalid SAM header format: {0}")]
    InvalidFormat(String),

    /// An error reported by the `noodles` reader, carried as its
    /// display string so the variant stays crate-version independent.
    #[error("noodles error: {0}")]
    Noodles(String),

    /// The file extension is not one of the supported formats
    /// (`sam`, `bam`, `cram`).
    #[error("Unsupported file format: {0}")]
    UnsupportedFormat(String),

    /// The header declares more contigs than the configured limit
    /// (enforced via `check_contig_limit`).
    #[error("Too many contigs: {0} exceeds maximum allowed (100000)")]
    TooManyContigs(usize),
}
28
29pub fn parse_file(path: &Path) -> Result<QueryHeader, ParseError> {
38 let extension = path
39 .extension()
40 .and_then(|e| e.to_str())
41 .map(str::to_lowercase);
42
43 match extension.as_deref() {
44 Some("sam") => parse_sam_file(path),
45 Some("bam") => parse_bam_file(path),
46 Some("cram") => parse_cram_file(path),
47 Some(ext) => Err(ParseError::UnsupportedFormat(ext.to_string())),
48 None => {
49 parse_sam_file(path)
51 }
52 }
53}
54
55fn parse_sam_file(path: &Path) -> Result<QueryHeader, ParseError> {
57 use noodles::sam;
58
59 let mut reader = std::fs::File::open(path)
60 .map(BufReader::new)
61 .map(sam::io::Reader::new)?;
62
63 let header = reader
64 .read_header()
65 .map_err(|e| ParseError::Noodles(e.to_string()))?;
66
67 header_to_query(&header, Some(path))
68}
69
70fn parse_bam_file(path: &Path) -> Result<QueryHeader, ParseError> {
72 use noodles::bam;
73
74 let mut reader = std::fs::File::open(path).map(bam::io::Reader::new)?;
75
76 let header = reader
77 .read_header()
78 .map_err(|e| ParseError::Noodles(e.to_string()))?;
79
80 header_to_query(&header, Some(path))
81}
82
83fn parse_cram_file(path: &Path) -> Result<QueryHeader, ParseError> {
85 use noodles::cram;
86
87 let mut reader = std::fs::File::open(path).map(cram::io::Reader::new)?;
88
89 reader
91 .read_file_definition()
92 .map_err(|e| ParseError::Noodles(e.to_string()))?;
93
94 let header = reader
95 .read_file_header()
96 .map_err(|e| ParseError::Noodles(e.to_string()))?;
97
98 header_to_query(&header, Some(path))
99}
100
101pub fn parse_bam_from_reader<R: std::io::Read>(reader: R) -> Result<QueryHeader, ParseError> {
113 use noodles::bam;
114
115 let mut reader = bam::io::Reader::new(reader);
116
117 let header = reader
118 .read_header()
119 .map_err(|e| ParseError::Noodles(e.to_string()))?;
120
121 header_to_query(&header, None)
122}
123
124pub fn parse_cram_from_reader<R: std::io::Read>(reader: R) -> Result<QueryHeader, ParseError> {
136 use noodles::cram;
137
138 let mut reader = cram::io::Reader::new(reader);
139
140 reader
141 .read_file_definition()
142 .map_err(|e| ParseError::Noodles(e.to_string()))?;
143
144 let header = reader
145 .read_file_header()
146 .map_err(|e| ParseError::Noodles(e.to_string()))?;
147
148 header_to_query(&header, None)
149}
150
151fn header_to_query(
153 header: &noodles::sam::Header,
154 source: Option<&Path>,
155) -> Result<QueryHeader, ParseError> {
156 use noodles::sam::header::record::value::map::tag::Other;
157
158 let mut contigs = Vec::new();
159
160 for (name, map) in header.reference_sequences() {
161 let name_str = name.to_string();
162 let length = map.length().get() as u64;
163
164 let mut contig = Contig::new(name_str, length);
165
166 if let Ok(m5_tag) = Other::try_from(*b"M5") {
169 if let Some(md5_value) = map.other_fields().get(&m5_tag) {
170 let md5_str = md5_value.to_string();
171 if let Some(normalized) = normalize_md5(&md5_str) {
173 contig.md5 = Some(normalized);
174 } else {
175 warn!(
176 contig = %contig.name,
177 md5 = %md5_str,
178 "Invalid MD5 checksum format, ignoring"
179 );
180 }
181 }
182 }
183
184 if let Ok(as_tag) = Other::try_from(*b"AS") {
186 if let Some(assembly_value) = map.other_fields().get(&as_tag) {
187 contig.assembly = Some(assembly_value.to_string());
188 }
189 }
190
191 if let Ok(ur_tag) = Other::try_from(*b"UR") {
193 if let Some(uri_value) = map.other_fields().get(&ur_tag) {
194 contig.uri = Some(uri_value.to_string());
195 }
196 }
197
198 if let Ok(sp_tag) = Other::try_from(*b"SP") {
200 if let Some(species_value) = map.other_fields().get(&sp_tag) {
201 contig.species = Some(species_value.to_string());
202 }
203 }
204
205 if let Ok(an_tag) = Other::try_from(*b"AN") {
207 if let Some(aliases_value) = map.other_fields().get(&an_tag) {
208 let aliases: Vec<String> = aliases_value
209 .to_string()
210 .split(',')
211 .map(|s| s.trim().to_string())
212 .filter(|s| !s.is_empty())
213 .collect();
214 if !aliases.is_empty() {
215 contig.aliases = aliases;
216 }
217 }
218 }
219
220 if check_contig_limit(contigs.len()).is_some() {
222 return Err(ParseError::TooManyContigs(contigs.len()));
223 }
224
225 contigs.push(contig);
226 }
227
228 let mut query = QueryHeader::new(contigs);
229 if let Some(path) = source {
230 query = query.with_source(path.display().to_string());
231 }
232
233 Ok(query)
234}
235
/// Normalize space-separated SAM header lines to tab-separated form.
///
/// The SAM specification requires tab-delimited header fields, but
/// headers pasted from terminals or documents often arrive with spaces.
/// Lines that look like space-delimited `@HD`/`@SQ`/`@RG`/`@PG` records
/// (see [`needs_space_to_tab_normalization`]) are rewritten with single
/// tabs between fields; all other lines pass through unchanged.
///
/// Returns the (possibly borrowed) text and a flag indicating whether
/// any rewriting happened. NOTE(review): on the rewrite path, `\r\n`
/// line endings are emitted as `\n` (a side effect of `str::lines`).
#[must_use]
pub fn normalize_sam_whitespace(text: &str) -> (Cow<'_, str>, bool) {
    // Fast path: nothing to fix — hand back the input without allocating.
    if !text.lines().any(needs_space_to_tab_normalization) {
        return (Cow::Borrowed(text), false);
    }

    let mut normalized = String::with_capacity(text.len());

    for (index, line) in text.lines().enumerate() {
        // Re-insert the separator between lines. Tracking the line index
        // (rather than checking `normalized.is_empty()`) keeps leading
        // blank lines intact — the old emptiness check silently dropped
        // them because an empty first line left the buffer empty.
        if index > 0 {
            normalized.push('\n');
        }

        if needs_space_to_tab_normalization(line) {
            // Rejoin the whitespace-separated fields with single tabs
            // (runs of spaces collapse to one tab).
            let mut fields = line.split_whitespace();
            if let Some(first) = fields.next() {
                normalized.push_str(first);
            }
            for field in fields {
                normalized.push('\t');
                normalized.push_str(field);
            }
        } else {
            normalized.push_str(line);
        }
    }

    // `lines()` swallows a trailing newline; restore it when present.
    if text.ends_with('\n') {
        normalized.push('\n');
    }

    (Cow::Owned(normalized), true)
}

/// Heuristic: does `line` look like a space-delimited SAM header record?
///
/// True only when the line contains no tabs, starts with a known record
/// type followed by a space, and at least one subsequent field matches
/// the `XY:value` two-uppercase-letter-tag shape. `@CO` comment lines
/// are deliberately excluded so free-form comments are never rewritten.
fn needs_space_to_tab_normalization(line: &str) -> bool {
    // Already tab-delimited: leave it alone.
    if line.contains('\t') {
        return false;
    }

    let sam_prefixes = ["@HD ", "@SQ ", "@RG ", "@PG "];
    if !sam_prefixes.iter().any(|p| line.starts_with(p)) {
        return false;
    }

    // Require at least one `XY:` field so prose that merely begins with
    // a record type is not mangled. The direct indexing of bytes 0 and 1
    // is safe: the `len() >= 3` guard runs first.
    line.split_whitespace().skip(1).any(|field| {
        field.len() >= 3
            && field.as_bytes().get(2) == Some(&b':')
            && field.as_bytes()[0].is_ascii_uppercase()
            && field.as_bytes()[1].is_ascii_uppercase()
    })
}
312
313pub fn parse_header_text(text: &str) -> Result<QueryHeader, ParseError> {
321 let (normalized_text, _) = normalize_sam_whitespace(text);
322 let text = &normalized_text;
323 let mut contigs = Vec::new();
324
325 for line in text.lines() {
326 if !line.starts_with("@SQ") {
327 continue;
328 }
329
330 let mut name: Option<String> = None;
331 let mut length: Option<u64> = None;
332 let mut md5_raw: Option<String> = None;
333 let mut assembly: Option<String> = None;
334 let mut uri: Option<String> = None;
335 let mut species: Option<String> = None;
336 let mut aliases: Vec<String> = Vec::new();
337
338 for field in line.split('\t').skip(1) {
339 if let Some((tag, value)) = field.split_once(':') {
340 match tag {
341 "SN" => name = Some(value.to_string()),
342 "LN" => length = value.parse().ok(),
343 "M5" => md5_raw = Some(value.to_string()),
344 "AS" => assembly = Some(value.to_string()),
345 "UR" => uri = Some(value.to_string()),
346 "SP" => species = Some(value.to_string()),
347 "AN" => {
348 aliases = value
350 .split(',')
351 .map(|s| s.trim().to_string())
352 .filter(|s| !s.is_empty())
353 .collect();
354 }
355 _ => {}
356 }
357 }
358 }
359
360 if let (Some(ref name_str), Some(length)) = (&name, length) {
361 if check_contig_limit(contigs.len()).is_some() {
363 return Err(ParseError::TooManyContigs(contigs.len()));
364 }
365
366 let md5 = if let Some(ref raw) = md5_raw {
368 if let Some(normalized) = normalize_md5(raw) {
369 Some(normalized)
370 } else {
371 warn!(
372 contig = %name_str,
373 md5 = %raw,
374 "Invalid MD5 checksum format, ignoring"
375 );
376 None
377 }
378 } else {
379 None
380 };
381
382 let mut contig = Contig::new(name_str.clone(), length);
383 contig.md5 = md5;
384 contig.assembly = assembly;
385 contig.uri = uri;
386 contig.species = species;
387 contig.aliases = aliases;
388 contigs.push(contig);
389 }
390 }
391
392 if contigs.is_empty() {
393 return Err(ParseError::InvalidFormat(
394 "No @SQ lines found in header".to_string(),
395 ));
396 }
397
398 Ok(QueryHeader::new(contigs))
399}
400
/// Unit tests for text-header parsing and whitespace normalization.
//
// NOTE(review): the field separators inside the raw-string fixtures
// below may be spaces or tabs in the original file — parsing succeeds
// either way because `parse_header_text` normalizes whitespace first.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_parse_header_text() {
        let header = r"@HD VN:1.6 SO:coordinate
@SQ SN:chr1 LN:248956422 M5:6aef897c3d6ff0c78aff06ac189178dd
@SQ SN:chr2 LN:242193529 M5:f98db672eb0993dcfdabafe2a882905c
@SQ SN:chrM LN:16569
@RG ID:sample1
";

        // @HD and @RG lines are ignored; only the three @SQ lines count.
        let query = parse_header_text(header).unwrap();
        assert_eq!(query.contigs.len(), 3);

        assert_eq!(query.contigs[0].name, "chr1");
        assert_eq!(query.contigs[0].length, 248_956_422);
        assert_eq!(
            query.contigs[0].md5,
            Some("6aef897c3d6ff0c78aff06ac189178dd".to_string())
        );

        assert_eq!(query.contigs[1].name, "chr2");
        assert_eq!(query.contigs[2].name, "chrM");
        // chrM has no M5 field, so its checksum stays None.
        assert!(query.contigs[2].md5.is_none());
    }

    #[test]
    fn test_parse_header_text_no_sq() {
        // A header without any @SQ lines is an InvalidFormat error.
        let header = "@HD\tVN:1.6\n@RG\tID:sample1\n";
        let result = parse_header_text(header);
        assert!(result.is_err());
    }

    #[test]
    fn test_parse_header_text_with_aliases() {
        let header = r"@HD VN:1.6
@SQ SN:chr1 LN:248956422 M5:6aef897c3d6ff0c78aff06ac189178dd AN:1,NC_000001.11
@SQ SN:chrM LN:16569 AN:MT,chrMT,NC_012920.1
";

        let query = parse_header_text(header).unwrap();
        assert_eq!(query.contigs.len(), 2);

        // AN values are comma-split into individual aliases.
        assert_eq!(query.contigs[0].name, "chr1");
        assert_eq!(
            query.contigs[0].aliases,
            vec!["1".to_string(), "NC_000001.11".to_string()]
        );

        assert_eq!(query.contigs[1].name, "chrM");
        assert_eq!(
            query.contigs[1].aliases,
            vec![
                "MT".to_string(),
                "chrMT".to_string(),
                "NC_012920.1".to_string()
            ]
        );
    }

    #[test]
    fn test_normalize_sam_whitespace_spaces_to_tabs() {
        let input = "@SQ SN:chr1 LN:248956422 M5:6aef897c3d6ff0c78aff06ac189178dd\n";
        let (normalized, was_normalized) = normalize_sam_whitespace(input);
        assert!(was_normalized);
        assert_eq!(
            normalized,
            "@SQ\tSN:chr1\tLN:248956422\tM5:6aef897c3d6ff0c78aff06ac189178dd\n"
        );
    }

    #[test]
    fn test_normalize_sam_whitespace_already_tabs() {
        // Tab-delimited input takes the borrowed fast path untouched.
        let input = "@SQ\tSN:chr1\tLN:248956422\n";
        let (normalized, was_normalized) = normalize_sam_whitespace(input);
        assert!(!was_normalized);
        assert_eq!(normalized, input);
    }

    #[test]
    fn test_normalize_sam_whitespace_mixed_lines() {
        let input =
            "@HD VN:1.6 SO:coordinate\n@SQ SN:chr1 LN:248956422\n@SQ SN:chr2 LN:242193529\n";
        let (normalized, was_normalized) = normalize_sam_whitespace(input);
        assert!(was_normalized);
        assert!(normalized.contains("@HD\tVN:1.6\tSO:coordinate"));
        assert!(normalized.contains("@SQ\tSN:chr1\tLN:248956422"));
        assert!(normalized.contains("@SQ\tSN:chr2\tLN:242193529"));
    }

    #[test]
    fn test_normalize_sam_whitespace_multiple_spaces() {
        // Runs of whitespace between fields collapse to a single tab.
        let input = "@SQ SN:chr1 LN:248956422\n";
        let (normalized, was_normalized) = normalize_sam_whitespace(input);
        assert!(was_normalized);
        assert_eq!(normalized, "@SQ\tSN:chr1\tLN:248956422\n");
    }

    #[test]
    fn test_normalize_sam_whitespace_preserves_non_header_lines() {
        // Only recognized header records are rewritten; other lines
        // pass through verbatim.
        let input = "some random text with spaces\n@SQ SN:chr1 LN:100\n";
        let (normalized, was_normalized) = normalize_sam_whitespace(input);
        assert!(was_normalized);
        assert!(normalized.starts_with("some random text with spaces\n"));
        assert!(normalized.contains("@SQ\tSN:chr1\tLN:100"));
    }

    #[test]
    fn test_normalize_sam_whitespace_tabs_and_spaces_mixed() {
        // Any tab on a line disqualifies it from normalization.
        let input = "@SQ\tSN:chr1 LN:248956422\n";
        let (normalized, was_normalized) = normalize_sam_whitespace(input);
        assert!(!was_normalized);
        assert_eq!(normalized, input);
    }

    #[test]
    fn test_normalize_sam_whitespace_skips_comment_lines() {
        // @CO is not in the recognized prefixes, so comments are kept
        // verbatim even when they contain TAG:value-looking text.
        let input = "@CO This is a comment with VN:1.0 mentioned\n@SQ SN:chr1 LN:100\n";
        let (normalized, was_normalized) = normalize_sam_whitespace(input);
        assert!(was_normalized);
        assert!(normalized.starts_with("@CO This is a comment with VN:1.0 mentioned\n"));
        assert!(normalized.contains("@SQ\tSN:chr1\tLN:100"));
    }

    #[test]
    fn test_parse_header_text_with_spaces() {
        // End-to-end: space-delimited input is normalized then parsed.
        let header = "@SQ SN:chr1 LN:248956422 M5:6aef897c3d6ff0c78aff06ac189178dd\n\
                      @SQ SN:chr2 LN:242193529\n";
        let query = parse_header_text(header).unwrap();
        assert_eq!(query.contigs.len(), 2);
        assert_eq!(query.contigs[0].name, "chr1");
        assert_eq!(query.contigs[0].length, 248_956_422);
        assert_eq!(
            query.contigs[0].md5,
            Some("6aef897c3d6ff0c78aff06ac189178dd".to_string())
        );
        assert_eq!(query.contigs[1].name, "chr2");
        assert_eq!(query.contigs[1].length, 242_193_529);
    }
}
547
/// Tests for the byte-stream (`*_from_reader`) parsing entry points,
/// using small BAM/CRAM payloads built in memory with noodles writers.
#[cfg(test)]
mod reader_tests {
    use super::*;
    use std::io::Cursor;

    // Build a minimal in-memory BAM file containing only a header with
    // the given (name, length) reference sequences.
    fn create_test_bam(contigs: &[(&str, usize)]) -> Vec<u8> {
        use noodles::bam;
        use noodles::sam;
        use noodles::sam::header::record::value::map::ReferenceSequence;
        use noodles::sam::header::record::value::Map;
        use std::num::NonZeroUsize;

        let mut header = sam::Header::builder();
        for &(name, length) in contigs {
            header = header.add_reference_sequence(
                name,
                Map::<ReferenceSequence>::new(NonZeroUsize::new(length).unwrap()),
            );
        }
        let header = header.build();

        let mut buf = Vec::new();
        {
            // Scoped so the writer releases its borrow of `buf`.
            let mut writer = bam::io::Writer::new(&mut buf);
            writer.write_header(&header).unwrap();
        }
        buf
    }

    #[test]
    fn test_parse_bam_from_reader_basic() {
        let bam_bytes = create_test_bam(&[("chr1", 248_956_422), ("chr2", 242_193_529)]);
        let cursor = Cursor::new(&bam_bytes);
        let query = parse_bam_from_reader(cursor).unwrap();
        assert_eq!(query.contigs.len(), 2);
        assert_eq!(query.contigs[0].name, "chr1");
        assert_eq!(query.contigs[0].length, 248_956_422);
        assert_eq!(query.contigs[1].name, "chr2");
        assert_eq!(query.contigs[1].length, 242_193_529);
    }

    #[test]
    fn test_parse_bam_from_reader_truncated_after_header() {
        // Garbage appended after the header must not break header-only
        // parsing, since nothing past the header is read.
        let mut bam_bytes = create_test_bam(&[("chr1", 248_956_422)]);
        bam_bytes.extend_from_slice(&[0u8; 100]);
        let cursor = Cursor::new(&bam_bytes);
        let query = parse_bam_from_reader(cursor).unwrap();
        assert_eq!(query.contigs.len(), 1);
        assert_eq!(query.contigs[0].name, "chr1");
    }

    #[test]
    fn test_parse_bam_from_reader_empty() {
        // An empty stream cannot contain a valid BAM header.
        let cursor = Cursor::new(Vec::<u8>::new());
        let result = parse_bam_from_reader(cursor);
        assert!(result.is_err());
    }

    // Build a minimal in-memory CRAM file whose header declares the
    // given (name, length, md5) reference sequences.
    fn create_test_cram(contigs: &[(&str, usize, &str)]) -> Vec<u8> {
        use noodles::cram;
        use noodles::sam;
        use noodles::sam::header::record::value::map::reference_sequence::tag;
        use noodles::sam::header::record::value::map::ReferenceSequence;
        use noodles::sam::header::record::value::Map;
        use std::num::NonZeroUsize;

        let mut header = sam::Header::builder();
        for &(name, length, md5) in contigs {
            let map = Map::<ReferenceSequence>::builder()
                .set_length(NonZeroUsize::new(length).unwrap())
                .insert(tag::MD5_CHECKSUM, md5)
                .build()
                .unwrap();
            header = header.add_reference_sequence(name, map);
        }
        let header = header.build();

        let mut buf = Vec::new();
        {
            // CRAM output needs the file definition before the header.
            let mut writer = cram::io::Writer::new(&mut buf);
            writer.write_file_definition().unwrap();
            writer.write_file_header(&header).unwrap();
        }
        buf
    }

    #[test]
    fn test_parse_cram_from_reader_basic() {
        let cram_bytes = create_test_cram(&[
            ("chr1", 248_956_422, "6aef897c3d6ff0c78aff06ac189178dd"),
            ("chrX", 156_040_895, "01234567890abcdef01234567890abcd"),
        ]);
        let cursor = Cursor::new(&cram_bytes);
        let query = parse_cram_from_reader(cursor).unwrap();
        assert_eq!(query.contigs.len(), 2);
        assert_eq!(query.contigs[0].name, "chr1");
        assert_eq!(query.contigs[0].length, 248_956_422);
        assert_eq!(
            query.contigs[0].md5,
            Some("6aef897c3d6ff0c78aff06ac189178dd".to_string())
        );
        assert_eq!(query.contigs[1].name, "chrX");
        assert_eq!(query.contigs[1].length, 156_040_895);
        assert_eq!(
            query.contigs[1].md5,
            Some("01234567890abcdef01234567890abcd".to_string())
        );
    }

    #[test]
    fn test_parse_cram_from_reader_truncated_after_header() {
        // As with BAM: trailing junk after the header is never read.
        let mut cram_bytes =
            create_test_cram(&[("chr1", 248_956_422, "6aef897c3d6ff0c78aff06ac189178dd")]);
        cram_bytes.extend_from_slice(&[0u8; 100]);
        let cursor = Cursor::new(&cram_bytes);
        let query = parse_cram_from_reader(cursor).unwrap();
        assert_eq!(query.contigs.len(), 1);
        assert_eq!(query.contigs[0].name, "chr1");
    }

    #[test]
    fn test_parse_cram_from_reader_empty() {
        // An empty stream fails at the file-definition read.
        let cursor = Cursor::new(Vec::<u8>::new());
        let result = parse_cram_from_reader(cursor);
        assert!(result.is_err());
    }
}