use crate::alignment::{SamHeader, SamRecord};
use crate::error::Result;
use flate2::Compression;
use flate2::write::GzEncoder;
use std::io::Write;
const BAM_MAGIC: &[u8; 4] = b"BAM\x01";
#[derive(Debug, Clone)]
pub struct BamFile {
pub header: SamHeader,
pub references: Vec<(String, u32)>,
pub records: Vec<BamRecord>,
}
impl BamFile {
pub fn new(header: SamHeader) -> Self {
BamFile {
header,
references: Vec::new(),
records: Vec::new(),
}
}
pub fn add_reference(&mut self, name: String, length: u32) {
self.references.push((name, length));
}
pub fn add_record(&mut self, record: BamRecord) {
self.records.push(record);
}
pub fn to_bytes(&self) -> Result<Vec<u8>> {
let mut uncompressed = Vec::new();
uncompressed.extend_from_slice(BAM_MAGIC);
let header_text = self.header.to_header_lines().join("\n");
let header_bytes = header_text.as_bytes();
write_le_i32(&mut uncompressed, header_bytes.len() as i32);
uncompressed.extend_from_slice(header_bytes);
write_le_i32(&mut uncompressed, self.references.len() as i32);
for (name, length) in &self.references {
let name_bytes = name.as_bytes();
write_le_i32(&mut uncompressed, (name_bytes.len() + 1) as i32);
uncompressed.extend_from_slice(name_bytes);
uncompressed.push(0); write_le_u32(&mut uncompressed, *length);
}
for record in &self.records {
let record_bytes = record.to_bytes()?;
write_le_i32(&mut uncompressed, record_bytes.len() as i32);
uncompressed.extend_from_slice(&record_bytes);
}
let mut compressed = Vec::new();
{
let mut encoder = GzEncoder::new(&mut compressed, Compression::fast());
encoder.write_all(&uncompressed)
.map_err(|e| crate::error::Error::Custom(format!("BGZF compression failed: {}", e)))?;
encoder.finish()
.map_err(|e| crate::error::Error::Custom(format!("BGZF flush failed: {}", e)))?;
}
Ok(compressed)
}
pub fn from_bytes(compressed_data: &[u8]) -> Result<Self> {
use flate2::read::GzDecoder;
use std::io::Read;
let mut decoder = GzDecoder::new(compressed_data);
let mut data = Vec::new();
decoder.read_to_end(&mut data)
.map_err(|e| crate::error::Error::Custom(format!("BGZF decompression failed: {}", e)))?;
let mut cursor = 0;
if cursor + 4 > data.len() || &data[cursor..cursor + 4] != BAM_MAGIC {
return Err(crate::error::Error::Custom("Invalid BAM magic bytes".to_string()));
}
cursor += 4;
let header_len = read_le_i32(&data, &mut cursor)? as usize;
if cursor + header_len > data.len() {
return Err(crate::error::Error::Custom("Invalid BAM header size".to_string()));
}
let _header_text = String::from_utf8(data[cursor..cursor + header_len].to_vec())
.map_err(|e| crate::error::Error::Custom(format!("Invalid UTF-8 in BAM header: {}", e)))?;
cursor += header_len;
let header = SamHeader::new("1.0");
let ref_count = read_le_i32(&data, &mut cursor)? as usize;
let mut references = Vec::new();
for _ in 0..ref_count {
let name_len = read_le_i32(&data, &mut cursor)? as usize;
if cursor + name_len > data.len() {
return Err(crate::error::Error::Custom("Invalid reference name size".to_string()));
}
let name = String::from_utf8(data[cursor..cursor + name_len - 1].to_vec())
.map_err(|e| crate::error::Error::Custom(format!("Invalid UTF-8 in reference name: {}", e)))?;
cursor += name_len;
let length = read_le_u32(&data, &mut cursor)?;
references.push((name, length));
}
let mut records = Vec::new();
while cursor < data.len() {
if cursor + 4 > data.len() {
break;
}
let record_len = read_le_i32(&data, &mut cursor)? as usize;
if cursor + record_len > data.len() {
break;
}
let record = BamRecord::from_bytes(&data[cursor..cursor + record_len])?;
records.push(record);
cursor += record_len;
}
Ok(BamFile {
header,
references,
records,
})
}
}
#[derive(Debug, Clone)]
pub struct BamRecord {
pub ref_id: i32,
pub pos: i32,
pub bin_mq_nl: u32,
pub flag_nc: u32,
pub l_seq: i32,
pub next_ref_id: i32,
pub next_pos: i32,
pub tlen: i32,
pub read_name: String,
pub cigar: Vec<(u32, u8)>, pub seq: Vec<u8>,
pub qual: Vec<u8>,
}
impl BamRecord {
pub fn from_sam(sam: &SamRecord, ref_id: i32) -> Self {
BamRecord {
ref_id,
pos: sam.pos as i32,
bin_mq_nl: 0, flag_nc: (sam.flag as u32) << 16, l_seq: sam.query_seq.len() as i32,
next_ref_id: -1,
next_pos: -1,
tlen: 0,
read_name: sam.qname.clone(),
cigar: Self::parse_cigar(&sam.cigar),
seq: Self::encode_seq(&sam.query_seq),
qual: sam.query_qual.as_bytes().to_vec(),
}
}
fn encode_seq(seq: &str) -> Vec<u8> {
let bases = vec!['=', 'A', 'C', 'M', 'G', 'R', 'S', 'V', 'T', 'W', 'Y', 'H', 'K', 'D', 'B', 'N'];
let mut encoded = vec![0u8; (seq.len() + 1) / 2];
for (i, base) in seq.chars().enumerate() {
let code = bases.iter().position(|&b| b == base).unwrap_or(0) as u8;
if i % 2 == 0 {
encoded[i / 2] |= code << 4;
} else {
encoded[i / 2] |= code;
}
}
encoded
}
fn parse_cigar(cigar: &str) -> Vec<(u32, u8)> {
let mut result = Vec::new();
let mut current_num = String::new();
for c in cigar.chars() {
if c.is_numeric() {
current_num.push(c);
} else {
if !current_num.is_empty() {
if let Ok(len) = current_num.parse::<u32>() {
let op = match c {
'M' => 0,
'I' => 1,
'D' => 2,
'N' => 3,
'S' => 4,
'H' => 5,
'P' => 6,
'=' => 7,
'X' => 8,
_ => 0,
};
result.push((len, op));
}
current_num.clear();
}
}
}
result
}
pub fn format_cigar(ops: &[(u32, u8)]) -> String {
let op_map = ['M', 'I', 'D', 'N', 'S', 'H', 'P', '=', 'X'];
ops.iter()
.map(|(len, op)| format!("{}{}", len, op_map[*op as usize]))
.collect::<Vec<_>>()
.join("")
}
fn to_bytes(&self) -> Result<Vec<u8>> {
let mut bytes = Vec::new();
write_le_i32(&mut bytes, self.ref_id);
write_le_i32(&mut bytes, self.pos);
write_le_u32(&mut bytes, self.bin_mq_nl);
write_le_u32(&mut bytes, self.flag_nc | self.cigar.len() as u32);
write_le_i32(&mut bytes, self.l_seq);
write_le_i32(&mut bytes, self.next_ref_id);
write_le_i32(&mut bytes, self.next_pos);
write_le_i32(&mut bytes, self.tlen);
bytes.extend_from_slice(self.read_name.as_bytes());
bytes.push(0);
for (len, op) in &self.cigar {
write_le_u32(&mut bytes, (len << 4) | (*op as u32));
}
bytes.extend_from_slice(&self.seq);
bytes.extend_from_slice(&self.qual);
Ok(bytes)
}
fn from_bytes(data: &[u8]) -> Result<Self> {
let mut cursor = 0;
let ref_id = read_le_i32(data, &mut cursor)?;
let pos = read_le_i32(data, &mut cursor)?;
let bin_mq_nl = read_le_u32(data, &mut cursor)?;
let flag_nc = read_le_u32(data, &mut cursor)?;
let l_seq = read_le_i32(data, &mut cursor)?;
let next_ref_id = read_le_i32(data, &mut cursor)?;
let next_pos = read_le_i32(data, &mut cursor)?;
let tlen = read_le_i32(data, &mut cursor)?;
let name_end = data[cursor..]
.iter()
.position(|&b| b == 0)
.ok_or_else(|| crate::error::Error::Custom("Invalid BAM record: no null terminator".to_string()))?;
let read_name = String::from_utf8(data[cursor..cursor + name_end].to_vec())
.map_err(|e| crate::error::Error::Custom(format!("Invalid UTF-8 in read name: {}", e)))?;
cursor += name_end + 1;
let cigar_count = (flag_nc & 0xFFFF) as usize;
let mut cigar = Vec::new();
for _ in 0..cigar_count {
let val = read_le_u32(data, &mut cursor)?;
let len = val >> 4;
let op = (val & 0xF) as u8;
cigar.push((len, op));
}
let seq_bytes = (l_seq + 1) / 2;
let seq = data[cursor..cursor + seq_bytes as usize].to_vec();
cursor += seq_bytes as usize;
let qual = data[cursor..cursor + l_seq as usize].to_vec();
Ok(BamRecord {
ref_id,
pos,
bin_mq_nl,
flag_nc: flag_nc & 0xFFFF0000,
l_seq,
next_ref_id,
next_pos,
tlen,
read_name,
cigar,
seq,
qual,
})
}
}
fn write_le_i32(bytes: &mut Vec<u8>, val: i32) {
bytes.extend_from_slice(&val.to_le_bytes());
}
fn write_le_u32(bytes: &mut Vec<u8>, val: u32) {
bytes.extend_from_slice(&val.to_le_bytes());
}
fn read_le_i32(data: &[u8], cursor: &mut usize) -> Result<i32> {
if *cursor + 4 > data.len() {
return Err(crate::error::Error::Custom("BAM: insufficient data for i32".to_string()));
}
let mut bytes = [0u8; 4];
bytes.copy_from_slice(&data[*cursor..*cursor + 4]);
*cursor += 4;
Ok(i32::from_le_bytes(bytes))
}
fn read_le_u32(data: &[u8], cursor: &mut usize) -> Result<u32> {
if *cursor + 4 > data.len() {
return Err(crate::error::Error::Custom("BAM: insufficient data for u32".to_string()));
}
let mut bytes = [0u8; 4];
bytes.copy_from_slice(&data[*cursor..*cursor + 4]);
*cursor += 4;
Ok(u32::from_le_bytes(bytes))
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_bam_file_creation() {
let header = SamHeader::new("1.0");
let mut bam = BamFile::new(header);
bam.add_reference("chr1".to_string(), 1000);
assert_eq!(bam.references.len(), 1);
assert_eq!(bam.references[0].0, "chr1");
assert_eq!(bam.references[0].1, 1000);
}
#[test]
fn test_bam_serialization() -> Result<()> {
let header = SamHeader::new("1.0");
let mut bam = BamFile::new(header);
bam.add_reference("chr1".to_string(), 1000);
let bytes = bam.to_bytes()?;
assert!(!bytes.is_empty());
let gzip_magic = [31u8, 139u8, 8u8, 0u8];
assert_eq!(&bytes[0..4], &gzip_magic, "BAM file should start with BGZF (gzip) magic bytes");
Ok(())
}
#[test]
#[test]
fn test_bam_compression_roundtrip() {
use flate2::read::GzDecoder;
use std::io::Read;
let header = SamHeader::new("1.0");
let mut bam = BamFile::new(header);
bam.add_reference("chr1".to_string(), 1000);
let compressed = bam.to_bytes().expect("Failed to serialize BAM");
let mut decoder = GzDecoder::new(&*compressed);
let mut decompressed = Vec::new();
decoder.read_to_end(&mut decompressed).expect("Failed to decompress");
assert_eq!(&decompressed[0..4], BAM_MAGIC, "Decompressed data should start with BAM magic");
}
#[test]
fn test_sequence_encoding() {
let encoded = BamRecord::encode_seq("ACGT");
assert!(!encoded.is_empty());
}
#[test]
fn test_cigar_parsing() {
let cigar = BamRecord::parse_cigar("10M2I5D3M");
assert_eq!(cigar.len(), 4);
assert_eq!(cigar[0], (10, 0)); assert_eq!(cigar[1], (2, 1)); assert_eq!(cigar[2], (5, 2)); assert_eq!(cigar[3], (3, 0)); }
#[test]
fn test_cigar_formatting() {
let ops = vec![(10, 0), (2, 1), (5, 2), (3, 0)];
let cigar = BamRecord::format_cigar(&ops);
assert_eq!(cigar, "10M2I5D3M");
}
#[test]
fn test_invalid_utf8_in_bam_header() {
let mut data = Vec::from(BAM_MAGIC);
data.extend_from_slice(&(4i32).to_le_bytes());
data.extend_from_slice(&[0xFF, 0xFE, 0xFD, 0xFC]);
data.extend_from_slice(&(0i32).to_le_bytes());
let result = BamFile::from_bytes(&data);
assert!(result.is_err(), "Should error on invalid UTF-8 in header");
}
#[test]
fn test_invalid_utf8_in_reference_name() {
let mut data = Vec::from(BAM_MAGIC);
data.extend_from_slice(&(0i32).to_le_bytes());
data.extend_from_slice(&(1i32).to_le_bytes()); data.extend_from_slice(&(5i32).to_le_bytes()); data.extend_from_slice(&[0xFF, 0xFE, 0xFD, 0x00]); data.extend_from_slice(&(1000i32).to_le_bytes());
let result = BamFile::from_bytes(&data);
assert!(result.is_err(), "Should error on invalid UTF-8 in reference name");
}
#[test]
fn test_invalid_utf8_in_read_name() {
let mut data = Vec::new();
data.extend_from_slice(&(0i32).to_le_bytes()); data.extend_from_slice(&(100i32).to_le_bytes()); data.extend_from_slice(&(0i32).to_le_bytes()); data.extend_from_slice(&(0i32).to_le_bytes()); data.extend_from_slice(&(0i32).to_le_bytes()); data.extend_from_slice(&(0i32).to_le_bytes()); data.extend_from_slice(&(0i32).to_le_bytes()); data.extend_from_slice(&(0i32).to_le_bytes());
data.extend_from_slice(&[0xFF, 0xFE, 0x00]);
let result = BamRecord::from_bytes(&data);
assert!(result.is_err(), "Should error on invalid UTF-8 in read name");
}
#[test]
fn test_valid_utf8_roundtrip() -> Result<()> {
let header = SamHeader::new("1.0");
let mut bam = BamFile::new(header);
bam.add_reference("chromosome_1".to_string(), 1000000);
bam.add_reference("chr2_fragment".to_string(), 2000000);
let bytes = bam.to_bytes()?;
let restored = BamFile::from_bytes(&bytes)?;
assert_eq!(restored.references.len(), 2);
assert_eq!(restored.references[0].0, "chromosome_1");
assert_eq!(restored.references[1].0, "chr2_fragment");
Ok(())
}
}