mod bin;
mod cigar;
pub mod data;
mod flags;
mod mapping_quality;
mod name;
pub(crate) mod num;
mod position;
mod quality_scores;
mod reference_sequence_id;
mod sequence;
mod template_length;
use std::{error, fmt, io};
use noodles::sam::{
self as sam,
alignment::{Record, RecordBuf},
};
pub use self::cigar::write_cigar;
use self::{
bin::write_bin,
cigar::overflowing_write_cigar_op_count,
cigar::write_cigar_from_slice,
data::write_data,
data::write_data_fast,
flags::write_flags,
mapping_quality::write_mapping_quality,
name::write_name,
position::write_position,
quality_scores::{write_quality_scores, write_quality_scores_from_slice},
reference_sequence_id::write_reference_sequence_id,
sequence::{write_sequence, write_sequence_from_slice},
template_length::write_template_length,
};
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum EncodeError {
InvalidReferenceSequenceId(reference_sequence_id::EncodeError),
InvalidAlignmentStart(position::EncodeError),
InvalidMateReferenceSequenceId(reference_sequence_id::EncodeError),
InvalidMateAlignmentStart(position::EncodeError),
}
impl error::Error for EncodeError {
fn source(&self) -> Option<&(dyn error::Error + 'static)> {
#[allow(clippy::match_same_arms)]
match self {
Self::InvalidReferenceSequenceId(e) => Some(e),
Self::InvalidAlignmentStart(e) => Some(e),
Self::InvalidMateReferenceSequenceId(e) => Some(e),
Self::InvalidMateAlignmentStart(e) => Some(e),
}
}
}
impl fmt::Display for EncodeError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
Self::InvalidReferenceSequenceId(_) => write!(f, "invalid reference sequence ID"),
Self::InvalidAlignmentStart(_) => write!(f, "invalid alignment start"),
Self::InvalidMateReferenceSequenceId(_) => {
write!(f, "invalid mate reference sequence ID")
}
Self::InvalidMateAlignmentStart(_) => write!(f, "invalid mate alignment start"),
}
}
}
#[inline]
fn estimate_record_size_fast<R>(record: &R) -> usize
where
R: Record + ?Sized,
{
const FIXED_OVERHEAD: usize = 32 + 32 + 16;
let seq_len = record.sequence().len();
FIXED_OVERHEAD + seq_len + seq_len.div_ceil(2) + 64
}
#[inline]
pub fn encode_with_prealloc<R>(
dst: &mut Vec<u8>,
header: &sam::Header,
record: &R,
) -> io::Result<()>
where
R: Record + ?Sized,
{
let estimated_size = estimate_record_size_fast(record);
let current_len = dst.len();
let available = dst.capacity() - current_len;
if available < estimated_size {
dst.reserve(estimated_size - available);
}
encode(dst, header, record)
}
#[inline]
pub fn encode_record_buf(
dst: &mut Vec<u8>,
header: &sam::Header,
record: &RecordBuf,
) -> io::Result<()> {
let estimated_size = estimate_record_size_fast(record);
let current_len = dst.len();
let available = dst.capacity() - current_len;
if available < estimated_size {
dst.reserve(estimated_size - available);
}
let reference_sequence_id = record.reference_sequence_id();
write_reference_sequence_id(dst, header, reference_sequence_id)
.map_err(EncodeError::InvalidReferenceSequenceId)
.map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?;
let alignment_start = record.alignment_start();
write_position(dst, alignment_start)
.map_err(EncodeError::InvalidAlignmentStart)
.map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?;
name::write_length(dst, record.name())?;
let mapping_quality = record.mapping_quality();
write_mapping_quality(dst, mapping_quality);
let alignment_end = record.alignment_end();
write_bin(dst, alignment_start, alignment_end);
let base_count = record.sequence().len();
let cigar = overflowing_write_cigar_op_count(dst, base_count, record.cigar())?;
let flags = record.flags();
write_flags(dst, flags);
sequence::write_length(dst, base_count)?;
let mate_reference_sequence_id = record.mate_reference_sequence_id();
write_reference_sequence_id(dst, header, mate_reference_sequence_id)
.map_err(EncodeError::InvalidMateReferenceSequenceId)
.map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?;
let mate_alignment_start = record.mate_alignment_start();
write_position(dst, mate_alignment_start)
.map_err(EncodeError::InvalidMateAlignmentStart)
.map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?;
let template_length = record.template_length();
write_template_length(dst, template_length);
write_name(dst, record.name())?;
if let Some(placeholder_cigar) = &cigar {
write_cigar(dst, placeholder_cigar)?;
} else {
let cigar_ops: &[sam::alignment::record::cigar::Op] = record.cigar().as_ref();
write_cigar_from_slice(dst, cigar_ops)?;
}
let seq_bytes: &[u8] = record.sequence().as_ref();
let read_length = record.cigar().read_length();
write_sequence_from_slice(dst, read_length, seq_bytes)?;
let qual_scores: &[u8] = record.quality_scores().as_ref();
write_quality_scores_from_slice(dst, base_count, qual_scores)?;
write_data_fast(dst, record.data())?;
if cigar.is_some() {
data::field::write_cigar(dst, record.cigar())?;
}
Ok(())
}
pub fn encode<R>(dst: &mut Vec<u8>, header: &sam::Header, record: &R) -> io::Result<()>
where
R: Record + ?Sized,
{
let reference_sequence_id = record.reference_sequence_id(header).transpose()?;
write_reference_sequence_id(dst, header, reference_sequence_id)
.map_err(EncodeError::InvalidReferenceSequenceId)
.map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?;
let alignment_start = record.alignment_start().transpose()?;
write_position(dst, alignment_start)
.map_err(EncodeError::InvalidAlignmentStart)
.map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?;
name::write_length(dst, record.name())?;
let mapping_quality = record.mapping_quality().transpose()?;
write_mapping_quality(dst, mapping_quality);
let alignment_end = record.alignment_end().transpose()?;
write_bin(dst, alignment_start, alignment_end);
let base_count = record.sequence().len();
let cigar = overflowing_write_cigar_op_count(dst, base_count, &record.cigar())?;
let flags = record.flags()?;
write_flags(dst, flags);
sequence::write_length(dst, base_count)?;
let mate_reference_sequence_id = record.mate_reference_sequence_id(header).transpose()?;
write_reference_sequence_id(dst, header, mate_reference_sequence_id)
.map_err(EncodeError::InvalidMateReferenceSequenceId)
.map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?;
let mate_alignment_start = record.mate_alignment_start().transpose()?;
write_position(dst, mate_alignment_start)
.map_err(EncodeError::InvalidMateAlignmentStart)
.map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?;
let template_length = record.template_length()?;
write_template_length(dst, template_length);
write_name(dst, record.name())?;
if let Some(cigar) = &cigar {
write_cigar(dst, cigar)?;
} else {
write_cigar(dst, &record.cigar())?;
}
let sequence = record.sequence();
let read_length = record.cigar().read_length()?;
write_sequence(dst, read_length, sequence)?;
write_quality_scores(dst, base_count, record.quality_scores())?;
write_data(dst, record.data())?;
if cigar.is_some() {
data::field::write_cigar(dst, &record.cigar())?;
}
Ok(())
}
#[cfg(test)]
mod tests {
use std::num::NonZero;
use noodles::core::Position;
use noodles::sam::{
alignment::RecordBuf,
header::record::value::{Map, map::ReferenceSequence},
};
use super::*;
#[test]
fn test_encode_with_default_fields() -> Result<(), Box<dyn std::error::Error>> {
let mut buf = Vec::new();
let header = sam::Header::default();
let record = RecordBuf::default();
encode(&mut buf, &header, &record)?;
let expected = [
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x02, 0xff, 0x48, 0x12, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x2a, 0x00, ];
assert_eq!(buf, expected);
Ok(())
}
#[test]
fn test_encode_with_all_fields() -> Result<(), Box<dyn std::error::Error>> {
use sam::alignment::{
record::{
Flags, MappingQuality,
cigar::{Op, op::Kind},
data::field::Tag,
},
record_buf::{QualityScores, Sequence, data::field::Value},
};
let mut buf = Vec::new();
let header = sam::Header::builder()
.add_reference_sequence(
"sq0",
Map::<ReferenceSequence>::new(const { NonZero::new(8).expect("non-zero value 8") }),
)
.add_reference_sequence(
"sq1",
Map::<ReferenceSequence>::new(
const { NonZero::new(13).expect("non-zero value 13") },
),
)
.build();
let record = RecordBuf::builder()
.set_name("r0")
.set_flags(Flags::SEGMENTED | Flags::FIRST_SEGMENT)
.set_reference_sequence_id(1)
.set_alignment_start(Position::try_from(9)?)
.set_mapping_quality(MappingQuality::try_from(13)?)
.set_cigar([Op::new(Kind::Match, 3), Op::new(Kind::SoftClip, 1)].into_iter().collect())
.set_mate_reference_sequence_id(1)
.set_mate_alignment_start(Position::try_from(22)?)
.set_template_length(144)
.set_sequence(Sequence::from(b"ACGT"))
.set_quality_scores(QualityScores::from(vec![45, 35, 43, 50]))
.set_data([(Tag::ALIGNMENT_HIT_COUNT, Value::from(1))].into_iter().collect())
.build();
encode(&mut buf, &header, &record)?;
let expected = [
0x01, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x03, 0x0d, 0x49, 0x12, 0x02, 0x00, 0x41, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x00, 0x90, 0x00, 0x00, 0x00, b'r', b'0', 0x00, 0x30, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x12, 0x48, 0x2d, 0x23, 0x2b, 0x32, b'N', b'H', b'C', 0x01, ];
assert_eq!(buf, expected);
Ok(())
}
#[test]
fn test_encode_with_oversized_cigar() -> Result<(), Box<dyn std::error::Error>> {
use sam::alignment::{
record::{
Flags,
cigar::{Op, op::Kind},
data::field::Tag,
},
record_buf::{Cigar, Sequence, data::field::Value},
};
const BASE_COUNT: usize = 65536;
let mut buf = Vec::new();
let header = sam::Header::builder()
.add_reference_sequence(
"sq0",
Map::<ReferenceSequence>::new(
const { NonZero::new(131_072).expect("non-zero value 131_072") },
),
)
.build();
let cigar = Cigar::from(vec![Op::new(Kind::Match, 1); BASE_COUNT]);
let sequence = Sequence::from(vec![b'A'; BASE_COUNT]);
let record = RecordBuf::builder()
.set_flags(Flags::empty())
.set_reference_sequence_id(0)
.set_alignment_start(Position::MIN)
.set_cigar(cigar)
.set_sequence(sequence)
.set_data([(Tag::ALIGNMENT_HIT_COUNT, Value::from(1))].into_iter().collect())
.build();
encode(&mut buf, &header, &record)?;
let mut expected = vec![
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0x49, 0x02, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, b'*', 0x00, 0x04, 0x00, 0x10, 0x00, 0x03, 0x00, 0x10, 0x00, ];
expected.resize(expected.len() + BASE_COUNT.div_ceil(2), 0x11); expected.resize(expected.len() + BASE_COUNT, 0xff); expected.extend([b'N', b'H', b'C', 0x01]);
expected.extend([b'C', b'G', b'B', b'I', 0x00, 0x00, 0x01, 0x00]);
expected.extend((0..BASE_COUNT).flat_map(|_| {
[
0x10, 0x00, 0x00, 0x00, ]
}));
assert_eq!(buf, expected);
Ok(())
}
}