use std::borrow::Cow;
use std::io::BufReader;
use std::path::Path;
use thiserror::Error;
use tracing::warn;
use crate::core::contig::Contig;
use crate::core::header::QueryHeader;
use crate::utils::validation::{check_contig_limit, normalize_md5};
/// Errors returned by the header parsing functions in this file.
#[derive(Error, Debug)]
pub enum ParseError {
/// An I/O failure; converted automatically from `std::io::Error` via `#[from]`.
#[error("IO error: {0}")]
Io(#[from] std::io::Error),
/// Header content that could not be used; carries a human-readable reason.
/// In this file it is produced by `parse_header_text` when no contig could be
/// built from the input.
#[error("Invalid SAM header format: {0}")]
InvalidFormat(String),
/// A failure reported by a noodles reader, stored as that error's display text.
#[error("noodles error: {0}")]
Noodles(String),
/// The file extension is not one of `sam`, `bam` or `cram`; carries the
/// lowercased extension.
#[error("Unsupported file format: {0}")]
UnsupportedFormat(String),
/// `check_contig_limit` rejected the header; carries the number of contigs
/// collected at the point of rejection.
// NOTE(review): the 100000 in the message is hard-coded here — confirm it
// matches the limit actually enforced by `check_contig_limit`.
#[error("Too many contigs: {0} exceeds maximum allowed (100000)")]
TooManyContigs(usize),
}
/// Reads the header of an alignment file, choosing the parser from the file
/// extension (compared case-insensitively).
///
/// `sam`, `bam` and `cram` select the matching parser. A path with no
/// extension, or one whose extension is not valid UTF-8, is read as SAM.
///
/// # Errors
///
/// Returns [`ParseError::UnsupportedFormat`] with the lowercased extension for
/// any other extension, or whatever error the selected parser reports.
pub fn parse_file(path: &Path) -> Result<QueryHeader, ParseError> {
    // No usable extension: fall back to the SAM parser.
    let lowered = match path.extension().and_then(|ext| ext.to_str()) {
        Some(ext) => ext.to_lowercase(),
        None => return parse_sam_file(path),
    };
    match lowered.as_str() {
        "sam" => parse_sam_file(path),
        "bam" => parse_bam_file(path),
        "cram" => parse_cram_file(path),
        _ => Err(ParseError::UnsupportedFormat(lowered)),
    }
}
fn parse_sam_file(path: &Path) -> Result<QueryHeader, ParseError> {
use noodles::sam;
let mut reader = std::fs::File::open(path)
.map(BufReader::new)
.map(sam::io::Reader::new)?;
let header = reader
.read_header()
.map_err(|e| ParseError::Noodles(e.to_string()))?;
header_to_query(&header, Some(path))
}
fn parse_bam_file(path: &Path) -> Result<QueryHeader, ParseError> {
use noodles::bam;
let mut reader = std::fs::File::open(path).map(bam::io::Reader::new)?;
let header = reader
.read_header()
.map_err(|e| ParseError::Noodles(e.to_string()))?;
header_to_query(&header, Some(path))
}
fn parse_cram_file(path: &Path) -> Result<QueryHeader, ParseError> {
use noodles::cram;
let mut reader = std::fs::File::open(path).map(cram::io::Reader::new)?;
reader
.read_file_definition()
.map_err(|e| ParseError::Noodles(e.to_string()))?;
let header = reader
.read_file_header()
.map_err(|e| ParseError::Noodles(e.to_string()))?;
header_to_query(&header, Some(path))
}
pub fn parse_bam_from_reader<R: std::io::Read>(reader: R) -> Result<QueryHeader, ParseError> {
use noodles::bam;
let mut reader = bam::io::Reader::new(reader);
let header = reader
.read_header()
.map_err(|e| ParseError::Noodles(e.to_string()))?;
header_to_query(&header, None)
}
pub fn parse_cram_from_reader<R: std::io::Read>(reader: R) -> Result<QueryHeader, ParseError> {
use noodles::cram;
let mut reader = cram::io::Reader::new(reader);
reader
.read_file_definition()
.map_err(|e| ParseError::Noodles(e.to_string()))?;
let header = reader
.read_file_header()
.map_err(|e| ParseError::Noodles(e.to_string()))?;
header_to_query(&header, None)
}
/// Converts a parsed noodles SAM header into a [`QueryHeader`].
///
/// One [`Contig`] is built per reference sequence, in header order, from its
/// name and length. The optional `M5`, `AS`, `UR`, `SP` and `AN` fields are
/// copied over when present:
///
/// * `M5` is passed through `normalize_md5`; a value it rejects is logged with
///   a warning and left unset.
/// * `AN` is split on commas; entries are trimmed and empty ones dropped.
///
/// When `source` is given, its display form is recorded on the result via
/// `with_source`.
///
/// # Errors
///
/// Returns [`ParseError::TooManyContigs`] as soon as `check_contig_limit`
/// rejects the number of contigs collected so far.
fn header_to_query(
    header: &noodles::sam::Header,
    source: Option<&Path>,
) -> Result<QueryHeader, ParseError> {
    use noodles::sam::header::record::value::map::tag::Other;

    let mut contigs = Vec::new();
    for (name, map) in header.reference_sequences() {
        // Check the limit before doing any per-contig work so an over-limit
        // header fails without building a contig that would be thrown away.
        if check_contig_limit(contigs.len()).is_some() {
            return Err(ParseError::TooManyContigs(contigs.len()));
        }

        // Looks up a two-byte tag among the non-standard fields of this
        // reference sequence and returns its value as an owned string.
        let other_field = |code: [u8; 2]| -> Option<String> {
            let tag = Other::try_from(code).ok()?;
            map.other_fields().get(&tag).map(|value| value.to_string())
        };

        let length = map.length().get() as u64;
        let mut contig = Contig::new(name.to_string(), length);

        if let Some(md5_str) = other_field(*b"M5") {
            match normalize_md5(&md5_str) {
                Some(normalized) => contig.md5 = Some(normalized),
                None => warn!(
                    contig = %contig.name,
                    md5 = %md5_str,
                    "Invalid MD5 checksum format, ignoring"
                ),
            }
        }
        if let Some(assembly) = other_field(*b"AS") {
            contig.assembly = Some(assembly);
        }
        if let Some(uri) = other_field(*b"UR") {
            contig.uri = Some(uri);
        }
        if let Some(species) = other_field(*b"SP") {
            contig.species = Some(species);
        }
        if let Some(raw_aliases) = other_field(*b"AN") {
            let aliases: Vec<String> = raw_aliases
                .split(',')
                .map(str::trim)
                .filter(|alias| !alias.is_empty())
                .map(String::from)
                .collect();
            // Leave whatever `Contig::new` set up when nothing usable was listed.
            if !aliases.is_empty() {
                contig.aliases = aliases;
            }
        }

        contigs.push(contig);
    }

    let query = QueryHeader::new(contigs);
    Ok(match source {
        Some(path) => query.with_source(path.display().to_string()),
        None => query,
    })
}
/// Rewrites space-separated SAM header lines so their fields are tab-separated.
///
/// Each line is checked with `needs_space_to_tab_normalization`. Lines that
/// qualify are split on runs of whitespace and re-joined with single tabs; all
/// other lines are copied through unchanged, including blank lines at the
/// start of the text.
///
/// Returns the resulting text and a flag saying whether anything was rewritten.
/// When no line qualifies, the input is returned borrowed and the flag is
/// `false`.
///
/// When the text is rewritten, lines are re-joined with `\n` (so a `\r\n`
/// ending becomes `\n`), and a trailing newline is kept only if the input
/// ended with one.
#[must_use]
pub fn normalize_sam_whitespace(text: &str) -> (Cow<'_, str>, bool) {
    if !text.lines().any(needs_space_to_tab_normalization) {
        return (Cow::Borrowed(text), false);
    }

    let mut normalized = String::with_capacity(text.len());
    for (line_index, line) in text.lines().enumerate() {
        // Separate by position rather than by "output is non-empty": the
        // latter swallows the newline after leading blank lines.
        if line_index > 0 {
            normalized.push('\n');
        }
        if needs_space_to_tab_normalization(line) {
            for (field_index, field) in line.split_whitespace().enumerate() {
                if field_index > 0 {
                    normalized.push('\t');
                }
                normalized.push_str(field);
            }
        } else {
            normalized.push_str(line);
        }
    }
    // `lines()` drops the final line terminator; restore it if there was one.
    if text.ends_with('\n') {
        normalized.push('\n');
    }
    (Cow::Owned(normalized), true)
}

/// Decides whether a single header line should have its spaces turned into tabs.
///
/// A line qualifies only when all of the following hold:
///
/// * it contains no tab at all,
/// * it starts with `@HD`, `@SQ`, `@RG` or `@PG` followed by a space
///   (other record types such as `@CO` are left alone), and
/// * at least one whitespace-separated field after the record type begins
///   with two ASCII uppercase letters and a colon, e.g. `SN:`.
fn needs_space_to_tab_normalization(line: &str) -> bool {
    const RECORD_PREFIXES: [&str; 4] = ["@HD ", "@SQ ", "@RG ", "@PG "];

    if line.contains('\t') {
        return false;
    }
    if !RECORD_PREFIXES.iter().any(|prefix| line.starts_with(prefix)) {
        return false;
    }
    line.split_whitespace().skip(1).any(|field| {
        matches!(
            field.as_bytes(),
            [first, second, b':', ..]
                if first.is_ascii_uppercase() && second.is_ascii_uppercase()
        )
    })
}
pub fn parse_header_text(text: &str) -> Result<QueryHeader, ParseError> {
let (normalized_text, _) = normalize_sam_whitespace(text);
let text = &normalized_text;
let mut contigs = Vec::new();
for line in text.lines() {
if !line.starts_with("@SQ") {
continue;
}
let mut name: Option<String> = None;
let mut length: Option<u64> = None;
let mut md5_raw: Option<String> = None;
let mut assembly: Option<String> = None;
let mut uri: Option<String> = None;
let mut species: Option<String> = None;
let mut aliases: Vec<String> = Vec::new();
for field in line.split('\t').skip(1) {
if let Some((tag, value)) = field.split_once(':') {
match tag {
"SN" => name = Some(value.to_string()),
"LN" => length = value.parse().ok(),
"M5" => md5_raw = Some(value.to_string()),
"AS" => assembly = Some(value.to_string()),
"UR" => uri = Some(value.to_string()),
"SP" => species = Some(value.to_string()),
"AN" => {
aliases = value
.split(',')
.map(|s| s.trim().to_string())
.filter(|s| !s.is_empty())
.collect();
}
_ => {}
}
}
}
if let (Some(ref name_str), Some(length)) = (&name, length) {
if check_contig_limit(contigs.len()).is_some() {
return Err(ParseError::TooManyContigs(contigs.len()));
}
let md5 = if let Some(ref raw) = md5_raw {
if let Some(normalized) = normalize_md5(raw) {
Some(normalized)
} else {
warn!(
contig = %name_str,
md5 = %raw,
"Invalid MD5 checksum format, ignoring"
);
None
}
} else {
None
};
let mut contig = Contig::new(name_str.clone(), length);
contig.md5 = md5;
contig.assembly = assembly;
contig.uri = uri;
contig.species = species;
contig.aliases = aliases;
contigs.push(contig);
}
}
if contigs.is_empty() {
return Err(ParseError::InvalidFormat(
"No @SQ lines found in header".to_string(),
));
}
Ok(QueryHeader::new(contigs))
}
// Unit tests for `parse_header_text` and `normalize_sam_whitespace`.
#[cfg(test)]
mod tests {
use super::*;
// A header with @HD, three @SQ and one @RG line yields exactly the three
// contigs, in order, with name, length and (where given) MD5 filled in.
#[test]
fn test_parse_header_text() {
let header = r"@HD VN:1.6 SO:coordinate
@SQ SN:chr1 LN:248956422 M5:6aef897c3d6ff0c78aff06ac189178dd
@SQ SN:chr2 LN:242193529 M5:f98db672eb0993dcfdabafe2a882905c
@SQ SN:chrM LN:16569
@RG ID:sample1
";
let query = parse_header_text(header).unwrap();
assert_eq!(query.contigs.len(), 3);
assert_eq!(query.contigs[0].name, "chr1");
assert_eq!(query.contigs[0].length, 248_956_422);
assert_eq!(
query.contigs[0].md5,
Some("6aef897c3d6ff0c78aff06ac189178dd".to_string())
);
assert_eq!(query.contigs[1].name, "chr2");
assert_eq!(query.contigs[2].name, "chrM");
// chrM carries no M5 field, so its checksum stays unset.
assert!(query.contigs[2].md5.is_none());
}
// A header without any @SQ line is rejected.
#[test]
fn test_parse_header_text_no_sq() {
let header = "@HD\tVN:1.6\n@RG\tID:sample1\n";
let result = parse_header_text(header);
assert!(result.is_err());
}
// The comma-separated AN field is split into individual aliases.
#[test]
fn test_parse_header_text_with_aliases() {
let header = r"@HD VN:1.6
@SQ SN:chr1 LN:248956422 M5:6aef897c3d6ff0c78aff06ac189178dd AN:1,NC_000001.11
@SQ SN:chrM LN:16569 AN:MT,chrMT,NC_012920.1
";
let query = parse_header_text(header).unwrap();
assert_eq!(query.contigs.len(), 2);
assert_eq!(query.contigs[0].name, "chr1");
assert_eq!(
query.contigs[0].aliases,
vec!["1".to_string(), "NC_000001.11".to_string()]
);
assert_eq!(query.contigs[1].name, "chrM");
assert_eq!(
query.contigs[1].aliases,
vec![
"MT".to_string(),
"chrMT".to_string(),
"NC_012920.1".to_string()
]
);
}
// A space-separated @SQ line is rewritten with tabs and reported as changed.
#[test]
fn test_normalize_sam_whitespace_spaces_to_tabs() {
let input = "@SQ SN:chr1 LN:248956422 M5:6aef897c3d6ff0c78aff06ac189178dd\n";
let (normalized, was_normalized) = normalize_sam_whitespace(input);
assert!(was_normalized);
assert_eq!(
normalized,
"@SQ\tSN:chr1\tLN:248956422\tM5:6aef897c3d6ff0c78aff06ac189178dd\n"
);
}
// Text that is already tab-separated comes back unchanged and unflagged.
#[test]
fn test_normalize_sam_whitespace_already_tabs() {
let input = "@SQ\tSN:chr1\tLN:248956422\n";
let (normalized, was_normalized) = normalize_sam_whitespace(input);
assert!(!was_normalized);
assert_eq!(normalized, input);
}
// Every qualifying line in a multi-line header is rewritten.
#[test]
fn test_normalize_sam_whitespace_mixed_lines() {
let input =
"@HD VN:1.6 SO:coordinate\n@SQ SN:chr1 LN:248956422\n@SQ SN:chr2 LN:242193529\n";
let (normalized, was_normalized) = normalize_sam_whitespace(input);
assert!(was_normalized);
assert!(normalized.contains("@HD\tVN:1.6\tSO:coordinate"));
assert!(normalized.contains("@SQ\tSN:chr1\tLN:248956422"));
assert!(normalized.contains("@SQ\tSN:chr2\tLN:242193529"));
}
// Fields of a space-separated @SQ line end up separated by exactly one tab.
#[test]
fn test_normalize_sam_whitespace_multiple_spaces() {
let input = "@SQ SN:chr1 LN:248956422\n";
let (normalized, was_normalized) = normalize_sam_whitespace(input);
assert!(was_normalized);
assert_eq!(normalized, "@SQ\tSN:chr1\tLN:248956422\n");
}
// Lines that are not recognised header records are copied through verbatim.
#[test]
fn test_normalize_sam_whitespace_preserves_non_header_lines() {
let input = "some random text with spaces\n@SQ SN:chr1 LN:100\n";
let (normalized, was_normalized) = normalize_sam_whitespace(input);
assert!(was_normalized);
assert!(normalized.starts_with("some random text with spaces\n"));
assert!(normalized.contains("@SQ\tSN:chr1\tLN:100"));
}
// A line that already contains a tab is left alone even if it also has spaces.
#[test]
fn test_normalize_sam_whitespace_tabs_and_spaces_mixed() {
let input = "@SQ\tSN:chr1 LN:248956422\n";
let (normalized, was_normalized) = normalize_sam_whitespace(input);
assert!(!was_normalized);
assert_eq!(normalized, input);
}
// @CO comment lines keep their spaces even when they mention a TAG:value token.
#[test]
fn test_normalize_sam_whitespace_skips_comment_lines() {
let input = "@CO This is a comment with VN:1.0 mentioned\n@SQ SN:chr1 LN:100\n";
let (normalized, was_normalized) = normalize_sam_whitespace(input);
assert!(was_normalized);
assert!(normalized.starts_with("@CO This is a comment with VN:1.0 mentioned\n"));
assert!(normalized.contains("@SQ\tSN:chr1\tLN:100"));
}
// `parse_header_text` accepts space-separated @SQ lines end to end.
#[test]
fn test_parse_header_text_with_spaces() {
let header = "@SQ SN:chr1 LN:248956422 M5:6aef897c3d6ff0c78aff06ac189178dd\n\
@SQ SN:chr2 LN:242193529\n";
let query = parse_header_text(header).unwrap();
assert_eq!(query.contigs.len(), 2);
assert_eq!(query.contigs[0].name, "chr1");
assert_eq!(query.contigs[0].length, 248_956_422);
assert_eq!(
query.contigs[0].md5,
Some("6aef897c3d6ff0c78aff06ac189178dd".to_string())
);
assert_eq!(query.contigs[1].name, "chr2");
assert_eq!(query.contigs[1].length, 242_193_529);
}
}
// Tests for the reader-based entry points, using BAM/CRAM data built in memory.
#[cfg(test)]
mod reader_tests {
use super::*;
use std::io::Cursor;
// Builds an in-memory BAM stream containing only a header with the given
// (name, length) reference sequences. Panics if a length is zero.
fn create_test_bam(contigs: &[(&str, usize)]) -> Vec<u8> {
use noodles::bam;
use noodles::sam;
use noodles::sam::header::record::value::map::ReferenceSequence;
use noodles::sam::header::record::value::Map;
use std::num::NonZeroUsize;
let mut header = sam::Header::builder();
for &(name, length) in contigs {
header = header.add_reference_sequence(
name,
Map::<ReferenceSequence>::new(NonZeroUsize::new(length).unwrap()),
);
}
let header = header.build();
let mut buf = Vec::new();
// Scope the writer so its mutable borrow of `buf` ends before `buf` is returned.
{
let mut writer = bam::io::Writer::new(&mut buf);
writer.write_header(&header).unwrap();
}
buf
}
// Names and lengths written into a BAM header are read back in order.
#[test]
fn test_parse_bam_from_reader_basic() {
let bam_bytes = create_test_bam(&[("chr1", 248_956_422), ("chr2", 242_193_529)]);
let cursor = Cursor::new(&bam_bytes);
let query = parse_bam_from_reader(cursor).unwrap();
assert_eq!(query.contigs.len(), 2);
assert_eq!(query.contigs[0].name, "chr1");
assert_eq!(query.contigs[0].length, 248_956_422);
assert_eq!(query.contigs[1].name, "chr2");
assert_eq!(query.contigs[1].length, 242_193_529);
}
// Bytes following the header (here 100 zero bytes) do not prevent the header
// from being parsed.
#[test]
fn test_parse_bam_from_reader_truncated_after_header() {
let mut bam_bytes = create_test_bam(&[("chr1", 248_956_422)]);
bam_bytes.extend_from_slice(&[0u8; 100]);
let cursor = Cursor::new(&bam_bytes);
let query = parse_bam_from_reader(cursor).unwrap();
assert_eq!(query.contigs.len(), 1);
assert_eq!(query.contigs[0].name, "chr1");
}
// An empty stream is reported as an error.
#[test]
fn test_parse_bam_from_reader_empty() {
let cursor = Cursor::new(Vec::<u8>::new());
let result = parse_bam_from_reader(cursor);
assert!(result.is_err());
}
// Builds an in-memory CRAM stream containing only the file definition and a
// header with the given (name, length, md5) reference sequences. Panics if a
// length is zero or the reference-sequence map cannot be built.
fn create_test_cram(contigs: &[(&str, usize, &str)]) -> Vec<u8> {
use noodles::cram;
use noodles::sam;
use noodles::sam::header::record::value::map::reference_sequence::tag;
use noodles::sam::header::record::value::map::ReferenceSequence;
use noodles::sam::header::record::value::Map;
use std::num::NonZeroUsize;
let mut header = sam::Header::builder();
for &(name, length, md5) in contigs {
let map = Map::<ReferenceSequence>::builder()
.set_length(NonZeroUsize::new(length).unwrap())
.insert(tag::MD5_CHECKSUM, md5)
.build()
.unwrap();
header = header.add_reference_sequence(name, map);
}
let header = header.build();
let mut buf = Vec::new();
// Scope the writer so its mutable borrow of `buf` ends before `buf` is returned.
{
let mut writer = cram::io::Writer::new(&mut buf);
writer.write_file_definition().unwrap();
writer.write_file_header(&header).unwrap();
}
buf
}
// Names, lengths and MD5 checksums written into a CRAM header are read back in order.
#[test]
fn test_parse_cram_from_reader_basic() {
let cram_bytes = create_test_cram(&[
("chr1", 248_956_422, "6aef897c3d6ff0c78aff06ac189178dd"),
("chrX", 156_040_895, "01234567890abcdef01234567890abcd"),
]);
let cursor = Cursor::new(&cram_bytes);
let query = parse_cram_from_reader(cursor).unwrap();
assert_eq!(query.contigs.len(), 2);
assert_eq!(query.contigs[0].name, "chr1");
assert_eq!(query.contigs[0].length, 248_956_422);
assert_eq!(
query.contigs[0].md5,
Some("6aef897c3d6ff0c78aff06ac189178dd".to_string())
);
assert_eq!(query.contigs[1].name, "chrX");
assert_eq!(query.contigs[1].length, 156_040_895);
assert_eq!(
query.contigs[1].md5,
Some("01234567890abcdef01234567890abcd".to_string())
);
}
// Bytes following the header (here 100 zero bytes) do not prevent the header
// from being parsed.
#[test]
fn test_parse_cram_from_reader_truncated_after_header() {
let mut cram_bytes =
create_test_cram(&[("chr1", 248_956_422, "6aef897c3d6ff0c78aff06ac189178dd")]);
cram_bytes.extend_from_slice(&[0u8; 100]);
let cursor = Cursor::new(&cram_bytes);
let query = parse_cram_from_reader(cursor).unwrap();
assert_eq!(query.contigs.len(), 1);
assert_eq!(query.contigs[0].name, "chr1");
}
// An empty stream is reported as an error.
#[test]
fn test_parse_cram_from_reader_empty() {
let cursor = Cursor::new(Vec::<u8>::new());
let result = parse_cram_from_reader(cursor);
assert!(result.is_err());
}
}