use std::collections::BTreeSet;
use anyhow::{Context, Result, anyhow};
use bstr::BStr;
use noodles::bam;
use noodles::core::Position;
use noodles::sam;
use noodles::sam::alignment::RecordBuf;
use noodles::sam::alignment::record::cigar::Op;
use noodles::sam::alignment::record::cigar::op::Kind;
use noodles::sam::alignment::record::{Flags, MappingQuality};
use rust_htslib::bam::record::{Aux as HtsAux, Record as HtsRecord};
use smallvec::SmallVec;
pub type TagKey = [u8; 2];
#[allow(clippy::large_enum_variant, reason = "see docstring above")]
#[derive(Clone)]
pub enum RikerRecord {
Bam(BamRec),
Fallback(FallbackRec),
Htslib(HtslibRec),
}
impl RikerRecord {
pub fn from_alignment_record<R: sam::alignment::Record>(
header: &sam::Header,
record: &R,
) -> Result<Self> {
let buf = RecordBuf::try_from_alignment_record(header, record)
.context("Failed to convert alignment record to RecordBuf")?;
Ok(Self::Fallback(FallbackRec::from_record_buf(buf)))
}
#[inline]
#[must_use]
pub fn flags(&self) -> Flags {
match self {
Self::Bam(r) => r.flags,
Self::Fallback(r) => r.inner.flags(),
Self::Htslib(r) => r.flags(),
}
}
#[inline]
#[must_use]
pub fn reference_sequence_id(&self) -> Option<usize> {
match self {
Self::Bam(r) => r.ref_id,
Self::Fallback(r) => r.inner.reference_sequence_id(),
Self::Htslib(r) => r.reference_sequence_id(),
}
}
#[inline]
#[must_use]
pub fn alignment_start(&self) -> Option<Position> {
match self {
Self::Bam(r) => r.pos,
Self::Fallback(r) => r.inner.alignment_start(),
Self::Htslib(r) => r.alignment_start(),
}
}
#[inline]
#[must_use]
pub fn alignment_end(&self) -> Option<Position> {
match self {
Self::Bam(r) => r.alignment_end,
Self::Fallback(r) => r.alignment_end,
Self::Htslib(r) => r.alignment_end(),
}
}
#[inline]
#[must_use]
pub fn mapping_quality(&self) -> Option<MappingQuality> {
match self {
Self::Bam(r) => r.mapq,
Self::Fallback(r) => r.inner.mapping_quality(),
Self::Htslib(r) => r.mapping_quality(),
}
}
#[inline]
#[must_use]
pub fn mate_reference_sequence_id(&self) -> Option<usize> {
match self {
Self::Bam(r) => r.mate_ref_id,
Self::Fallback(r) => r.inner.mate_reference_sequence_id(),
Self::Htslib(r) => r.mate_reference_sequence_id(),
}
}
#[inline]
#[must_use]
pub fn mate_alignment_start(&self) -> Option<Position> {
match self {
Self::Bam(r) => r.mate_pos,
Self::Fallback(r) => r.inner.mate_alignment_start(),
Self::Htslib(r) => r.mate_alignment_start(),
}
}
#[inline]
#[must_use]
pub fn template_length(&self) -> i32 {
match self {
Self::Bam(r) => r.tlen,
Self::Fallback(r) => r.inner.template_length(),
Self::Htslib(r) => r.template_length(),
}
}
#[inline]
#[must_use]
pub fn name(&self) -> Option<&BStr> {
match self {
Self::Bam(r) => r.inner.name(),
Self::Fallback(r) => r.inner.name(),
Self::Htslib(r) => r.name(),
}
}
#[inline]
#[must_use]
pub fn quality_scores(&self) -> &[u8] {
match self {
Self::Bam(r) => r.inner.quality_scores().as_bytes(),
Self::Fallback(r) => r.inner.quality_scores().as_ref(),
Self::Htslib(r) => r.quality_scores(),
}
}
#[must_use]
pub fn cigar_ops(&self) -> CigarOps<'_> {
match self {
Self::Bam(r) => CigarOps::Bam(r.inner.cigar().as_bytes().chunks_exact(4)),
Self::Fallback(r) => CigarOps::Fallback(r.inner.cigar().as_ref().iter()),
Self::Htslib(r) => CigarOps::Bam(r.cigar_bytes().chunks_exact(4)),
}
}
#[must_use]
pub fn cigar_len(&self) -> usize {
match self {
Self::Bam(r) => r.inner.cigar().len(),
Self::Fallback(r) => r.inner.cigar().as_ref().len(),
Self::Htslib(r) => r.cigar_len(),
}
}
#[must_use]
pub fn sequence_len(&self) -> usize {
match self {
Self::Bam(r) => r.inner.sequence().len(),
Self::Fallback(r) => r.inner.sequence().len(),
Self::Htslib(r) => r.sequence_len(),
}
}
#[must_use]
pub fn sequence(&self) -> &[u8] {
match self {
Self::Bam(r) => {
if r.sequence_populated { r.sequence_buf.as_slice() } else { &[] }
}
Self::Fallback(r) => r.inner.sequence().as_ref(),
Self::Htslib(r) => r.sequence(),
}
}
#[must_use]
pub fn aux_tag(&self, tag: TagKey) -> Option<AuxValue> {
match self {
Self::Bam(r) => {
if r.aux_populated {
r.aux_store.get(tag).cloned()
} else {
None
}
}
Self::Fallback(r) => {
let tag_obj = sam::alignment::record::data::field::Tag::from(tag);
let value = r.inner.data().get(&tag_obj)?;
record_buf_value_to_aux(value)
}
Self::Htslib(r) => r.aux_tag(tag),
}
}
#[must_use]
pub fn has_aux_tag(&self, tag: TagKey) -> bool {
match self {
Self::Bam(r) => {
r.aux_populated && (r.aux_store.get(tag).is_some() || r.aux_present.contains(&tag))
}
Self::Fallback(r) => {
let tag_obj = sam::alignment::record::data::field::Tag::from(tag);
r.inner.data().get(&tag_obj).is_some()
}
Self::Htslib(r) => r.has_aux_tag(tag),
}
}
}
#[derive(Clone)]
pub struct BamRec {
inner: bam::Record,
flags: Flags,
ref_id: Option<usize>,
pos: Option<Position>,
mapq: Option<MappingQuality>,
mate_ref_id: Option<usize>,
mate_pos: Option<Position>,
tlen: i32,
alignment_end: Option<Position>,
sequence_buf: Vec<u8>,
sequence_populated: bool,
aux_store: AuxTagStore,
aux_populated: bool,
aux_present: SmallVec<[TagKey; 4]>,
}
impl BamRec {
#[must_use]
pub(crate) fn new() -> Self {
Self {
inner: bam::Record::default(),
flags: Flags::empty(),
ref_id: None,
pos: None,
mapq: None,
mate_ref_id: None,
mate_pos: None,
tlen: 0,
alignment_end: None,
sequence_buf: Vec::new(),
sequence_populated: false,
aux_store: AuxTagStore::new(),
aux_populated: false,
aux_present: SmallVec::new(),
}
}
pub(crate) fn read_from<R: std::io::Read>(
&mut self,
reader: &mut bam::io::Reader<R>,
) -> Result<usize> {
let n = reader.read_record(&mut self.inner).context("Failed to read BAM record")?;
if n == 0 {
return Ok(0);
}
self.refresh_cache()?;
Ok(n)
}
pub(crate) fn install(&mut self, record: bam::Record) -> Result<()> {
self.inner = record;
self.refresh_cache()
}
pub(crate) fn decode_sequence(&mut self) {
let seq_view = self.inner.sequence();
let packed: &[u8] = seq_view.as_ref();
super::simd_seq::decode_packed_sequence_into(
packed,
seq_view.len(),
&mut self.sequence_buf,
);
self.sequence_populated = true;
}
pub(crate) fn scan_aux_tags(
&mut self,
values: &BTreeSet<TagKey>,
presence: &BTreeSet<TagKey>,
) -> Result<()> {
let aux = self.inner.data();
let aux_bytes: &[u8] = aux.as_ref();
scan_specific(aux_bytes, values, presence, &mut self.aux_store, &mut self.aux_present)
.context("Failed to scan aux tags")?;
self.aux_populated = true;
Ok(())
}
pub(crate) fn decode_all_aux(&mut self) -> Result<()> {
let aux = self.inner.data();
let aux_bytes: &[u8] = aux.as_ref();
scan_all(aux_bytes, &mut self.aux_store).context("Failed to decode all aux tags")?;
self.aux_populated = true;
Ok(())
}
fn refresh_cache(&mut self) -> Result<()> {
self.flags = self.inner.flags();
self.ref_id = match self.inner.reference_sequence_id() {
None => None,
Some(res) => Some(res.context("invalid reference sequence id")?),
};
self.pos = match self.inner.alignment_start() {
None => None,
Some(res) => Some(res.context("invalid alignment start")?),
};
self.mapq = self.inner.mapping_quality();
self.mate_ref_id = match self.inner.mate_reference_sequence_id() {
None => None,
Some(res) => Some(res.context("invalid mate reference sequence id")?),
};
self.mate_pos = match self.inner.mate_alignment_start() {
None => None,
Some(res) => Some(res.context("invalid mate alignment start")?),
};
self.tlen = self.inner.template_length();
let cigar_bytes = self.inner.cigar().as_bytes();
validate_cigar_codes(cigar_bytes)?;
self.alignment_end = compute_alignment_end(self.pos, cigar_bytes)?;
self.sequence_populated = false;
self.aux_populated = false;
self.aux_store.clear();
self.aux_present.clear();
Ok(())
}
}
#[derive(Clone)]
pub struct FallbackRec {
inner: RecordBuf,
alignment_end: Option<Position>,
}
impl FallbackRec {
#[must_use]
pub(crate) fn from_record_buf(inner: RecordBuf) -> Self {
let alignment_end = inner.alignment_end();
Self { inner, alignment_end }
}
#[must_use]
pub(crate) fn empty() -> Self {
Self { inner: RecordBuf::default(), alignment_end: None }
}
pub(crate) fn inner_mut(&mut self) -> &mut RecordBuf {
&mut self.inner
}
pub(crate) fn refresh_cache(&mut self) {
self.alignment_end = self.inner.alignment_end();
}
}
#[cfg(target_endian = "little")]
const _: () = ();
#[cfg(not(target_endian = "little"))]
compile_error!(
"HtslibRec assumes a little-endian target (htslib raw_cigar() is native-endian u32)"
);
#[derive(Clone)]
pub struct HtslibRec {
inner: HtsRecord,
alignment_end: Option<Position>,
sequence_buf: Vec<u8>,
sequence_populated: bool,
aux_store: AuxTagStore,
aux_populated: bool,
aux_present: SmallVec<[TagKey; 4]>,
}
impl HtslibRec {
#[must_use]
pub(crate) fn new() -> Self {
Self {
inner: HtsRecord::new(),
alignment_end: None,
sequence_buf: Vec::new(),
sequence_populated: false,
aux_store: AuxTagStore::new(),
aux_populated: false,
aux_present: SmallVec::new(),
}
}
pub(crate) fn read_from<R: rust_htslib::bam::Read>(&mut self, reader: &mut R) -> Result<bool> {
match reader.read(&mut self.inner) {
None => Ok(false),
Some(result) => {
result.context("Failed to read CRAM record")?;
self.refresh_cache()?;
Ok(true)
}
}
}
pub(crate) fn decode_sequence(&mut self) {
let seq = self.inner.seq();
super::simd_seq::decode_packed_sequence_into(
seq.encoded,
seq.len(),
&mut self.sequence_buf,
);
self.sequence_populated = true;
}
pub(crate) fn scan_aux_tags(
&mut self,
values: &BTreeSet<TagKey>,
presence: &BTreeSet<TagKey>,
) -> Result<()> {
for result in self.inner.aux_iter() {
let (tag_slice, aux) = result.context("Failed to parse aux field on CRAM record")?;
let tag: TagKey = tag_slice.try_into().context("aux tag was not exactly 2 bytes")?;
if values.contains(&tag) {
if let Some(value) = htslib_aux_to_value(&aux) {
self.aux_store.insert(tag, value);
}
} else if presence.contains(&tag) && !self.aux_present.contains(&tag) {
self.aux_present.push(tag);
}
}
self.aux_populated = true;
Ok(())
}
pub(crate) fn decode_all_aux(&mut self) -> Result<()> {
for result in self.inner.aux_iter() {
let (tag_slice, aux) = result.context("Failed to parse aux field on CRAM record")?;
let tag: TagKey = tag_slice.try_into().context("aux tag was not exactly 2 bytes")?;
if let Some(value) = htslib_aux_to_value(&aux) {
self.aux_store.insert(tag, value);
}
}
self.aux_populated = true;
Ok(())
}
fn refresh_cache(&mut self) -> Result<()> {
let cigar_bytes = self.cigar_bytes();
validate_cigar_codes(cigar_bytes)?;
self.alignment_end = compute_alignment_end(self.alignment_start(), cigar_bytes)?;
self.sequence_populated = false;
self.aux_populated = false;
self.aux_store.clear();
self.aux_present.clear();
Ok(())
}
pub(crate) fn flags(&self) -> Flags {
Flags::from_bits_truncate(self.inner.flags())
}
pub(crate) fn reference_sequence_id(&self) -> Option<usize> {
let tid = self.inner.tid();
if tid < 0 { None } else { usize::try_from(tid).ok() }
}
pub(crate) fn alignment_start(&self) -> Option<Position> {
let pos = self.inner.pos();
if pos < 0 {
return None;
}
usize::try_from(pos).ok().and_then(|p| p.checked_add(1)).and_then(Position::new)
}
pub(crate) fn alignment_end(&self) -> Option<Position> {
self.alignment_end
}
pub(crate) fn mapping_quality(&self) -> Option<MappingQuality> {
MappingQuality::new(self.inner.mapq())
}
pub(crate) fn mate_reference_sequence_id(&self) -> Option<usize> {
let tid = self.inner.mtid();
if tid < 0 { None } else { usize::try_from(tid).ok() }
}
pub(crate) fn mate_alignment_start(&self) -> Option<Position> {
let pos = self.inner.mpos();
if pos < 0 {
return None;
}
usize::try_from(pos).ok().and_then(|p| p.checked_add(1)).and_then(Position::new)
}
pub(crate) fn template_length(&self) -> i32 {
let clamped = self.inner.insert_size().clamp(i64::from(i32::MIN), i64::from(i32::MAX));
i32::try_from(clamped).expect("value clamped to i32 range")
}
pub(crate) fn name(&self) -> Option<&BStr> {
let q = self.inner.qname();
if q.is_empty() || q == b"*" { None } else { Some(BStr::new(q)) }
}
pub(crate) fn quality_scores(&self) -> &[u8] {
let q = self.inner.qual();
if !q.is_empty() && q.iter().all(|&b| b == 0xFF) { &[] } else { q }
}
pub(crate) fn cigar_bytes(&self) -> &[u8] {
bytemuck::cast_slice(self.inner.raw_cigar())
}
pub(crate) fn cigar_len(&self) -> usize {
self.inner.cigar_len()
}
pub(crate) fn sequence_len(&self) -> usize {
self.inner.seq_len()
}
pub(crate) fn sequence(&self) -> &[u8] {
if self.sequence_populated { self.sequence_buf.as_slice() } else { &[] }
}
pub(crate) fn aux_tag(&self, tag: TagKey) -> Option<AuxValue> {
if self.aux_populated { self.aux_store.get(tag).cloned() } else { None }
}
pub(crate) fn has_aux_tag(&self, tag: TagKey) -> bool {
self.aux_populated && (self.aux_store.get(tag).is_some() || self.aux_present.contains(&tag))
}
}
fn htslib_aux_to_value(aux: &HtsAux<'_>) -> Option<AuxValue> {
Some(match *aux {
HtsAux::Char(c) => AuxValue::Character(c),
HtsAux::I8(n) => AuxValue::Int(i64::from(n)),
HtsAux::U8(n) => AuxValue::Int(i64::from(n)),
HtsAux::I16(n) => AuxValue::Int(i64::from(n)),
HtsAux::U16(n) => AuxValue::Int(i64::from(n)),
HtsAux::I32(n) => AuxValue::Int(i64::from(n)),
HtsAux::U32(n) => AuxValue::Int(i64::from(n)),
HtsAux::Float(f) => AuxValue::Float(f),
#[allow(clippy::cast_possible_truncation, reason = "see fn doc")]
HtsAux::Double(f) => AuxValue::Float(f as f32),
HtsAux::String(s) => AuxValue::String(s.as_bytes().to_vec()),
HtsAux::HexByteArray(s) => AuxValue::Hex(s.as_bytes().to_vec()),
HtsAux::ArrayI8(_)
| HtsAux::ArrayU8(_)
| HtsAux::ArrayI16(_)
| HtsAux::ArrayU16(_)
| HtsAux::ArrayI32(_)
| HtsAux::ArrayU32(_)
| HtsAux::ArrayFloat(_) => return None,
})
}
pub enum CigarOps<'a> {
Bam(std::slice::ChunksExact<'a, u8>),
Fallback(std::slice::Iter<'a, Op>),
}
impl Iterator for CigarOps<'_> {
type Item = Op;
#[inline]
fn next(&mut self) -> Option<Self::Item> {
match self {
Self::Bam(chunks) => chunks.next().map(|c| {
let word = u32::from_le_bytes([c[0], c[1], c[2], c[3]]);
let kind = code_to_kind(word & 0xF);
#[allow(clippy::cast_possible_truncation)]
let len = (word >> 4) as usize;
Op::new(kind, len)
}),
Self::Fallback(iter) => iter.next().copied(),
}
}
fn size_hint(&self) -> (usize, Option<usize>) {
match self {
Self::Bam(chunks) => chunks.size_hint(),
Self::Fallback(iter) => iter.size_hint(),
}
}
}
#[derive(Debug, Clone, PartialEq)]
pub enum AuxValue {
Character(u8),
Int(i64),
Float(f32),
String(Vec<u8>),
Hex(Vec<u8>),
}
impl AuxValue {
#[inline]
#[must_use]
pub fn as_int(&self) -> Option<i64> {
if let Self::Int(n) = self { Some(*n) } else { None }
}
#[inline]
#[must_use]
pub fn as_float(&self) -> Option<f32> {
if let Self::Float(f) = self { Some(*f) } else { None }
}
#[inline]
#[must_use]
pub fn as_char(&self) -> Option<u8> {
if let Self::Character(c) = self { Some(*c) } else { None }
}
#[inline]
#[must_use]
pub fn as_str(&self) -> Option<&BStr> {
if let Self::String(bytes) = self { Some(BStr::new(bytes.as_slice())) } else { None }
}
#[inline]
#[must_use]
pub fn as_hex(&self) -> Option<&BStr> {
if let Self::Hex(bytes) = self { Some(BStr::new(bytes.as_slice())) } else { None }
}
}
#[derive(Debug, Default, Clone)]
pub(crate) struct AuxTagStore {
tags: SmallVec<[(TagKey, AuxValue); 4]>,
}
impl AuxTagStore {
#[must_use]
pub(crate) fn new() -> Self {
Self::default()
}
pub(crate) fn clear(&mut self) {
self.tags.clear();
}
pub(crate) fn insert(&mut self, tag: TagKey, value: AuxValue) {
if let Some((_, existing)) = self.tags.iter_mut().find(|(k, _)| *k == tag) {
*existing = value;
} else {
self.tags.push((tag, value));
}
}
#[must_use]
pub(crate) fn get(&self, tag: TagKey) -> Option<&AuxValue> {
self.tags.iter().find(|(k, _)| *k == tag).map(|(_, v)| v)
}
#[cfg(test)]
pub(crate) fn len(&self) -> usize {
self.tags.len()
}
}
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct RikerRecordRequirements {
pub(crate) sequence: bool,
pub(crate) aux_tags: AuxTagRequirements,
}
impl RikerRecordRequirements {
pub const NONE: Self = Self { sequence: false, aux_tags: AuxTagRequirements::None };
#[must_use]
pub fn needs_sequence(&self) -> bool {
self.sequence
}
#[must_use]
pub fn aux_tags(&self) -> &AuxTagRequirements {
&self.aux_tags
}
#[must_use]
pub fn everything() -> Self {
Self { sequence: true, aux_tags: AuxTagRequirements::All }
}
#[must_use]
pub fn with_sequence(mut self) -> Self {
self.sequence = true;
self
}
#[must_use]
pub fn with_aux_tag(mut self, tag: TagKey) -> Self {
self.aux_tags = self.aux_tags.with(tag);
self
}
#[must_use]
pub fn with_aux_tag_presence(mut self, tag: TagKey) -> Self {
self.aux_tags = self.aux_tags.with_presence(tag);
self
}
#[must_use]
pub fn with_all_aux_tags(mut self) -> Self {
self.aux_tags = AuxTagRequirements::All;
self
}
#[must_use]
pub fn union(self, other: Self) -> Self {
Self {
sequence: self.sequence || other.sequence,
aux_tags: self.aux_tags.union(other.aux_tags),
}
}
}
impl Default for RikerRecordRequirements {
fn default() -> Self {
Self::NONE
}
}
#[derive(Debug, Clone, Default, Eq, PartialEq)]
pub enum AuxTagRequirements {
#[default]
None,
Specific {
values: BTreeSet<TagKey>,
presence: BTreeSet<TagKey>,
},
All,
}
impl AuxTagRequirements {
#[must_use]
pub fn with(self, tag: TagKey) -> Self {
match self {
Self::None => {
Self::Specific { values: BTreeSet::from([tag]), presence: BTreeSet::new() }
}
Self::Specific { mut values, mut presence } => {
values.insert(tag);
presence.remove(&tag);
Self::Specific { values, presence }
}
Self::All => Self::All,
}
}
#[must_use]
pub fn with_presence(self, tag: TagKey) -> Self {
match self {
Self::None => {
Self::Specific { values: BTreeSet::new(), presence: BTreeSet::from([tag]) }
}
Self::Specific { values, mut presence } => {
if !values.contains(&tag) {
presence.insert(tag);
}
Self::Specific { values, presence }
}
Self::All => Self::All,
}
}
#[must_use]
pub fn union(self, other: Self) -> Self {
match (self, other) {
(Self::All, _) | (_, Self::All) => Self::All,
(Self::None, x) | (x, Self::None) => x,
(
Self::Specific { values: mut va, presence: mut pa },
Self::Specific { values: vb, presence: pb },
) => {
va.extend(vb);
pa.extend(pb);
pa.retain(|t| !va.contains(t));
Self::Specific { values: va, presence: pa }
}
}
}
#[must_use]
pub fn is_none(&self) -> bool {
matches!(self, Self::None)
}
}
fn validate_cigar_codes(cigar_bytes: &[u8]) -> Result<()> {
let (chunks, []) = cigar_bytes.as_chunks::<4>() else {
return Err(anyhow!("cigar buffer length {} is not a multiple of 4", cigar_bytes.len()));
};
for chunk in chunks {
let code = u32::from_le_bytes(*chunk) & 0xF;
if code > 8 {
return Err(anyhow!("invalid BAM CIGAR op code: {code} (valid range is 0..=8)"));
}
}
Ok(())
}
fn cigar_reference_span(cigar_bytes: &[u8]) -> Result<u32> {
let (chunks, []) = cigar_bytes.as_chunks::<4>() else {
return Err(anyhow!("cigar buffer length {} is not a multiple of 4", cigar_bytes.len()));
};
let mut span: u32 = 0;
for chunk in chunks {
let word = u32::from_le_bytes(*chunk);
let kind = code_to_kind(word & 0xF);
let len = word >> 4;
if consumes_reference(kind) {
span = span.checked_add(len).ok_or_else(|| anyhow!("cigar ref span overflow"))?;
}
}
Ok(span)
}
fn compute_alignment_end(pos: Option<Position>, cigar_bytes: &[u8]) -> Result<Option<Position>> {
let Some(start) = pos else { return Ok(None) };
let span = cigar_reference_span(cigar_bytes)?;
if span == 0 {
return Ok(None);
}
let end_1based = usize::from(start)
.checked_add(span as usize)
.and_then(|v| v.checked_sub(1))
.and_then(Position::new);
Ok(end_1based)
}
#[inline]
fn code_to_kind(code: u32) -> Kind {
match code {
0 => Kind::Match,
1 => Kind::Insertion,
2 => Kind::Deletion,
3 => Kind::Skip,
4 => Kind::SoftClip,
5 => Kind::HardClip,
6 => Kind::Pad,
7 => Kind::SequenceMatch,
8 => Kind::SequenceMismatch,
n => {
panic!("invalid BAM CIGAR op code: {n} (validate_cigar_codes should have rejected it)")
}
}
}
#[inline]
fn consumes_reference(kind: Kind) -> bool {
matches!(
kind,
Kind::Match | Kind::Deletion | Kind::Skip | Kind::SequenceMatch | Kind::SequenceMismatch
)
}
fn scan_specific(
aux_bytes: &[u8],
values: &BTreeSet<TagKey>,
presence: &BTreeSet<TagKey>,
dst: &mut AuxTagStore,
present: &mut SmallVec<[TagKey; 4]>,
) -> Result<usize> {
let mut pos = 0;
let mut found = 0;
while pos + 3 <= aux_bytes.len() {
let tag: TagKey = [aux_bytes[pos], aux_bytes[pos + 1]];
let type_byte = aux_bytes[pos + 2];
pos += 3;
let value_len = aux_value_len(aux_bytes, pos, type_byte)?;
let end = pos + value_len;
if end > aux_bytes.len() {
anyhow::bail!("aux value of type '{}' extends past end of buffer", type_byte as char);
}
if values.contains(&tag) {
if let Some(value) = parse_aux_value(type_byte, &aux_bytes[pos..end])? {
dst.insert(tag, value);
found += 1;
}
} else if presence.contains(&tag) && !present.contains(&tag) {
present.push(tag);
found += 1;
}
pos = end;
}
if pos != aux_bytes.len() {
anyhow::bail!("trailing garbage in aux buffer: {} stray bytes", aux_bytes.len() - pos);
}
Ok(found)
}
fn scan_all(aux_bytes: &[u8], dst: &mut AuxTagStore) -> Result<usize> {
let mut pos = 0;
let mut found = 0;
while pos + 3 <= aux_bytes.len() {
let tag: TagKey = [aux_bytes[pos], aux_bytes[pos + 1]];
let type_byte = aux_bytes[pos + 2];
pos += 3;
let value_len = aux_value_len(aux_bytes, pos, type_byte)?;
let end = pos + value_len;
if end > aux_bytes.len() {
anyhow::bail!("aux value of type '{}' extends past end of buffer", type_byte as char);
}
if let Some(value) = parse_aux_value(type_byte, &aux_bytes[pos..end])? {
dst.insert(tag, value);
found += 1;
}
pos = end;
}
if pos != aux_bytes.len() {
anyhow::bail!("trailing garbage in aux buffer: {} stray bytes", aux_bytes.len() - pos);
}
Ok(found)
}
fn aux_value_len(aux_bytes: &[u8], pos: usize, type_byte: u8) -> Result<usize> {
Ok(match type_byte {
b'A' | b'c' | b'C' => 1,
b's' | b'S' => 2,
b'i' | b'I' | b'f' => 4,
b'Z' | b'H' => {
let terminator = aux_bytes[pos..]
.iter()
.position(|&b| b == 0)
.ok_or_else(|| anyhow!("aux string missing NUL terminator"))?;
terminator + 1 }
b'B' => {
if pos + 5 > aux_bytes.len() {
anyhow::bail!("aux B-array header truncated");
}
let elem_ty = aux_bytes[pos];
let count = u32::from_le_bytes(aux_bytes[pos + 1..pos + 5].try_into().unwrap());
let elem_size: usize = match elem_ty {
b'c' | b'C' => 1,
b's' | b'S' => 2,
b'i' | b'I' | b'f' => 4,
_ => anyhow::bail!("unsupported aux B-array element type '{}'", elem_ty as char),
};
let payload_len = (count as usize).checked_mul(elem_size).ok_or_else(|| {
anyhow!("aux B-array size overflow: count={count}, elem_size={elem_size}")
})?;
payload_len.checked_add(5).ok_or_else(|| anyhow!("aux B-array length overflow"))?
}
_ => anyhow::bail!("unknown aux type byte '{}' ({:#x})", type_byte as char, type_byte),
})
}
fn parse_aux_value(type_byte: u8, value: &[u8]) -> Result<Option<AuxValue>> {
Ok(Some(match type_byte {
b'A' => AuxValue::Character(value[0]),
b'c' => AuxValue::Int(i64::from(i8::from_le_bytes([value[0]]))),
b'C' => AuxValue::Int(i64::from(value[0])),
b's' => AuxValue::Int(i64::from(i16::from_le_bytes(value.try_into().unwrap()))),
b'S' => AuxValue::Int(i64::from(u16::from_le_bytes(value.try_into().unwrap()))),
b'i' => AuxValue::Int(i64::from(i32::from_le_bytes(value.try_into().unwrap()))),
b'I' => AuxValue::Int(i64::from(u32::from_le_bytes(value.try_into().unwrap()))),
b'f' => AuxValue::Float(f32::from_le_bytes(value.try_into().unwrap())),
b'Z' => {
let s = if value.last() == Some(&0) { &value[..value.len() - 1] } else { value };
AuxValue::String(s.to_vec())
}
b'H' => {
let s = if value.last() == Some(&0) { &value[..value.len() - 1] } else { value };
AuxValue::Hex(s.to_vec())
}
b'B' => return Ok(None), _ => anyhow::bail!("unknown aux type byte '{}' ({:#x})", type_byte as char, type_byte),
}))
}
fn record_buf_value_to_aux(v: &sam::alignment::record_buf::data::field::Value) -> Option<AuxValue> {
use sam::alignment::record_buf::data::field::Value;
Some(match v {
Value::Character(c) => AuxValue::Character(*c),
Value::Int8(n) => AuxValue::Int(i64::from(*n)),
Value::UInt8(n) => AuxValue::Int(i64::from(*n)),
Value::Int16(n) => AuxValue::Int(i64::from(*n)),
Value::UInt16(n) => AuxValue::Int(i64::from(*n)),
Value::Int32(n) => AuxValue::Int(i64::from(*n)),
Value::UInt32(n) => AuxValue::Int(i64::from(*n)),
Value::Float(f) => AuxValue::Float(*f),
Value::String(s) => AuxValue::String(s.to_vec()),
Value::Hex(s) => AuxValue::Hex(s.to_vec()),
Value::Array(_) => return None,
})
}
#[cfg(test)]
mod tests {
use super::*;
use noodles::sam::alignment::record::cigar::op::Kind as OpKind;
use noodles::sam::alignment::record_buf::{Cigar, QualityScores, Sequence};
fn make_record_buf_with_cigar(ops: Vec<Op>) -> RecordBuf {
let cigar: Cigar = ops.into_iter().collect();
RecordBuf::builder()
.set_name(b"r1".to_vec())
.set_flags(Flags::SEGMENTED | Flags::PROPERLY_SEGMENTED)
.set_reference_sequence_id(0)
.set_alignment_start(Position::new(101).unwrap())
.set_mate_reference_sequence_id(0)
.set_mate_alignment_start(Position::new(201).unwrap())
.set_template_length(200)
.set_cigar(cigar)
.set_sequence(Sequence::from(vec![b'A'; 100]))
.set_quality_scores(QualityScores::from(vec![30u8; 100]))
.build()
}
fn kind_code(kind: Kind) -> u32 {
match kind {
Kind::Match => 0,
Kind::Insertion => 1,
Kind::Deletion => 2,
Kind::Skip => 3,
Kind::SoftClip => 4,
Kind::HardClip => 5,
Kind::Pad => 6,
Kind::SequenceMatch => 7,
Kind::SequenceMismatch => 8,
}
}
fn encode_aux(fields: &[(TagKey, &str)]) -> Vec<u8> {
let mut out = Vec::new();
for (tag, spec) in fields {
out.extend_from_slice(tag);
let (type_byte, value) = spec.split_at(1);
let tb = type_byte.as_bytes()[0];
out.push(tb);
match tb {
b'i' => {
let n: i32 = value.parse().unwrap();
out.extend_from_slice(&n.to_le_bytes());
}
b'C' => {
let n: u8 = value.parse().unwrap();
out.push(n);
}
b'f' => {
let f: f32 = value.parse().unwrap();
out.extend_from_slice(&f.to_le_bytes());
}
b'Z' => {
out.extend_from_slice(value.as_bytes());
out.push(0);
}
b'A' => {
out.push(value.as_bytes()[0]);
}
_ => panic!("test encoder doesn't handle type '{}'", tb as char),
}
}
out
}
#[test]
fn fallback_scalar_accessors_are_infallible() {
let ops = vec![Op::new(OpKind::Match, 100)];
let buf = make_record_buf_with_cigar(ops);
let record = RikerRecord::Fallback(FallbackRec::from_record_buf(buf));
assert!(record.flags().is_segmented());
assert_eq!(record.reference_sequence_id(), Some(0));
assert_eq!(record.alignment_start().unwrap().get(), 101);
assert_eq!(record.template_length(), 200);
assert_eq!(record.name().unwrap(), b"r1");
}
#[test]
fn fallback_alignment_end_computed_from_ref_span() {
let ops = vec![
Op::new(OpKind::Match, 50),
Op::new(OpKind::Insertion, 5),
Op::new(OpKind::Match, 45),
];
let buf = make_record_buf_with_cigar(ops);
let record = RikerRecord::Fallback(FallbackRec::from_record_buf(buf));
assert_eq!(record.alignment_end().unwrap().get(), 195);
}
#[test]
fn fallback_sequence_is_always_populated() {
let ops = vec![Op::new(OpKind::Match, 100)];
let buf = make_record_buf_with_cigar(ops);
let record = RikerRecord::Fallback(FallbackRec::from_record_buf(buf));
let seq = record.sequence();
assert_eq!(seq.len(), 100);
assert!(seq.iter().all(|&b| b == b'A'));
assert_eq!(record.sequence_len(), 100);
}
#[test]
fn fallback_cigar_iteration_matches_input() {
let ops = vec![
Op::new(OpKind::Match, 50),
Op::new(OpKind::Insertion, 5),
Op::new(OpKind::Match, 45),
];
let buf = make_record_buf_with_cigar(ops.clone());
let record = RikerRecord::Fallback(FallbackRec::from_record_buf(buf));
let round_tripped: Vec<Op> = record.cigar_ops().collect();
assert_eq!(round_tripped, ops);
}
#[test]
fn fallback_aux_tag_converts_int_and_string_on_the_fly() {
use noodles::sam::alignment::record::data::field::Tag;
use noodles::sam::alignment::record_buf::data::field::Value;
let mut buf = make_record_buf_with_cigar(vec![Op::new(OpKind::Match, 100)]);
buf.data_mut().insert(Tag::ALIGNMENT_HIT_COUNT, Value::Int32(3));
buf.data_mut().insert(Tag::MISMATCHED_POSITIONS, Value::String(b"50ACGT".into()));
let record = RikerRecord::Fallback(FallbackRec::from_record_buf(buf));
assert_eq!(record.aux_tag(*b"NH").and_then(|v| v.as_int()), Some(3));
let md = record.aux_tag(*b"MD").unwrap();
assert_eq!(md.as_str().unwrap(), "50ACGT");
}
#[test]
fn fallback_aux_tag_returns_none_for_missing() {
let buf = make_record_buf_with_cigar(vec![Op::new(OpKind::Match, 100)]);
let record = RikerRecord::Fallback(FallbackRec::from_record_buf(buf));
assert!(record.aux_tag(*b"NM").is_none());
}
#[test]
fn cigar_reference_span_handles_deletion_and_skip() {
let packed = [
10u32 << 4, (5u32 << 4) | 2, (10u32 << 4) | 3, 10u32 << 4, ];
let bytes: Vec<u8> = packed.iter().flat_map(|w| w.to_le_bytes()).collect();
assert_eq!(cigar_reference_span(&bytes).unwrap(), 35);
}
#[test]
fn cigar_reference_span_rejects_non_multiple_of_four() {
let bytes = vec![0u8; 5];
assert!(cigar_reference_span(&bytes).is_err());
}
#[test]
fn validate_cigar_codes_rejects_out_of_range_op() {
let word = (10u32 << 4) | 9;
let bytes: Vec<u8> = word.to_le_bytes().to_vec();
let err = validate_cigar_codes(&bytes).unwrap_err();
assert!(
err.to_string().contains("invalid BAM CIGAR op code"),
"expected op-code error, got: {err}"
);
}
#[test]
fn validate_cigar_codes_accepts_all_valid_codes() {
let bytes: Vec<u8> = (0u32..=8).flat_map(|c| ((1u32 << 4) | c).to_le_bytes()).collect();
validate_cigar_codes(&bytes).unwrap();
}
#[test]
fn kind_code_round_trips() {
for kind in [
OpKind::Match,
OpKind::Insertion,
OpKind::Deletion,
OpKind::Skip,
OpKind::SoftClip,
OpKind::HardClip,
OpKind::Pad,
OpKind::SequenceMatch,
OpKind::SequenceMismatch,
] {
assert_eq!(code_to_kind(kind_code(kind)), kind);
}
}
fn scan_values(aux: &[u8], wanted: &BTreeSet<TagKey>, dst: &mut AuxTagStore) -> Result<usize> {
let presence: BTreeSet<TagKey> = BTreeSet::new();
let mut present: SmallVec<[TagKey; 4]> = SmallVec::new();
scan_specific(aux, wanted, &presence, dst, &mut present)
}
#[test]
fn scan_extracts_requested_int_tag_ignoring_others() {
let aux = encode_aux(&[(*b"NM", "i3"), (*b"MD", "Z50"), (*b"AS", "i100")]);
let wanted: BTreeSet<TagKey> = [*b"NM"].into_iter().collect();
let mut dst = AuxTagStore::new();
let found = scan_values(&aux, &wanted, &mut dst).unwrap();
assert_eq!(found, 1);
assert_eq!(dst.get(*b"NM").unwrap().as_int(), Some(3));
assert!(dst.get(*b"MD").is_none());
assert!(dst.get(*b"AS").is_none());
}
#[test]
fn scan_extracts_multiple_tags() {
let aux = encode_aux(&[(*b"NM", "i3"), (*b"MD", "Z50ACGT"), (*b"AS", "i100")]);
let wanted: BTreeSet<TagKey> = [*b"NM", *b"AS"].into_iter().collect();
let mut dst = AuxTagStore::new();
let found = scan_values(&aux, &wanted, &mut dst).unwrap();
assert_eq!(found, 2);
assert_eq!(dst.get(*b"NM").unwrap().as_int(), Some(3));
assert_eq!(dst.get(*b"AS").unwrap().as_int(), Some(100));
}
#[test]
fn scan_presence_only_skips_value_decode_for_string_tag() {
let aux = encode_aux(&[(*b"SA", "Z1,12345,+,40M,30,2;"), (*b"NM", "i7")]);
let presence: BTreeSet<TagKey> = [*b"SA"].into_iter().collect();
let values: BTreeSet<TagKey> = [*b"NM"].into_iter().collect();
let mut dst = AuxTagStore::new();
let mut present: SmallVec<[TagKey; 4]> = SmallVec::new();
let found = scan_specific(&aux, &values, &presence, &mut dst, &mut present).unwrap();
assert_eq!(found, 2);
assert!(present.contains(b"SA"));
assert!(dst.get(*b"SA").is_none());
assert_eq!(dst.get(*b"NM").unwrap().as_int(), Some(7));
}
#[test]
fn all_integer_widths_collapse_into_int() {
let mut aux = Vec::new();
aux.extend_from_slice(b"T1c");
aux.push(0xFFu8); aux.extend_from_slice(b"T2C");
aux.push(200u8);
aux.extend_from_slice(b"T3s");
aux.extend_from_slice(&(-500i16).to_le_bytes());
aux.extend_from_slice(b"T4S");
aux.extend_from_slice(&40000u16.to_le_bytes());
aux.extend_from_slice(b"T5i");
aux.extend_from_slice(&(-1_000_000i32).to_le_bytes());
aux.extend_from_slice(b"T6I");
aux.extend_from_slice(&3_000_000_000u32.to_le_bytes());
let wanted: BTreeSet<TagKey> =
[*b"T1", *b"T2", *b"T3", *b"T4", *b"T5", *b"T6"].into_iter().collect();
let mut dst = AuxTagStore::new();
scan_values(&aux, &wanted, &mut dst).unwrap();
assert_eq!(dst.get(*b"T1").unwrap().as_int(), Some(-1));
assert_eq!(dst.get(*b"T2").unwrap().as_int(), Some(200));
assert_eq!(dst.get(*b"T3").unwrap().as_int(), Some(-500));
assert_eq!(dst.get(*b"T4").unwrap().as_int(), Some(40000));
assert_eq!(dst.get(*b"T5").unwrap().as_int(), Some(-1_000_000));
assert_eq!(dst.get(*b"T6").unwrap().as_int(), Some(3_000_000_000));
}
#[test]
fn string_and_char_and_float_values() {
let aux = encode_aux(&[(*b"MD", "Z50ACGT"), (*b"XC", "AN"), (*b"XF", "f0.75")]);
let wanted: BTreeSet<TagKey> = [*b"MD", *b"XC", *b"XF"].into_iter().collect();
let mut dst = AuxTagStore::new();
scan_values(&aux, &wanted, &mut dst).unwrap();
assert_eq!(dst.get(*b"MD").and_then(AuxValue::as_str).unwrap(), "50ACGT");
assert_eq!(dst.get(*b"XC").and_then(AuxValue::as_char), Some(b'N'));
let f = dst.get(*b"XF").and_then(AuxValue::as_float).unwrap();
assert!((f - 0.75).abs() < 1e-6);
}
#[test]
fn scan_skips_unsupported_array_type_silently() {
let mut aux = Vec::new();
aux.extend_from_slice(b"BAB");
aux.push(b'i');
aux.extend_from_slice(&3u32.to_le_bytes());
for v in [1i32, 2, 3] {
aux.extend_from_slice(&v.to_le_bytes());
}
aux.extend_from_slice(b"NMi");
aux.extend_from_slice(&5i32.to_le_bytes());
let wanted: BTreeSet<TagKey> = [*b"BA", *b"NM"].into_iter().collect();
let mut dst = AuxTagStore::new();
scan_values(&aux, &wanted, &mut dst).unwrap();
assert!(dst.get(*b"BA").is_none());
assert_eq!(dst.get(*b"NM").unwrap().as_int(), Some(5));
}
#[test]
fn aux_b_array_oversized_payload_rejected_by_scan_bounds() {
let mut aux = Vec::new();
aux.extend_from_slice(b"BAB"); aux.push(b'i'); aux.extend_from_slice(&u32::MAX.to_le_bytes());
let wanted: BTreeSet<TagKey> = [*b"BA"].into_iter().collect();
let mut dst = AuxTagStore::new();
let err = scan_values(&aux, &wanted, &mut dst).unwrap_err();
assert!(
err.to_string().contains("extends past end of buffer"),
"expected past-end error, got: {err}"
);
}
#[test]
fn malformed_aux_surface_as_error() {
let aux = b"XXQabcd".to_vec();
let wanted: BTreeSet<TagKey> = [*b"XX"].into_iter().collect();
let mut dst = AuxTagStore::new();
assert!(scan_values(&aux, &wanted, &mut dst).is_err());
}
#[test]
fn aux_store_insert_replaces_on_same_tag() {
let mut s = AuxTagStore::new();
s.insert(*b"NM", AuxValue::Int(1));
s.insert(*b"NM", AuxValue::Int(2));
assert_eq!(s.len(), 1);
assert_eq!(s.get(*b"NM").unwrap().as_int(), Some(2));
}
#[test]
fn requirements_default_is_none() {
let n = RikerRecordRequirements::default();
assert_eq!(n, RikerRecordRequirements::NONE);
assert!(!n.needs_sequence());
assert!(n.aux_tags().is_none());
}
#[test]
fn requirements_with_sequence_sets_flag() {
let n = RikerRecordRequirements::NONE.with_sequence();
assert!(n.needs_sequence());
assert!(n.aux_tags().is_none());
}
#[test]
fn requirements_with_aux_tag_transitions_none_to_specific() {
let n = RikerRecordRequirements::NONE.with_aux_tag(*b"NM");
match n.aux_tags {
AuxTagRequirements::Specific { values, presence } => {
assert_eq!(values.len(), 1);
assert!(values.contains(b"NM"));
assert!(presence.is_empty());
}
other => panic!("expected Specific, got {other:?}"),
}
}
#[test]
fn requirements_multiple_with_aux_tag_calls_accumulate() {
let n = RikerRecordRequirements::NONE.with_aux_tag(*b"NM").with_aux_tag(*b"MD");
match n.aux_tags {
AuxTagRequirements::Specific { values, presence: _ } => {
assert_eq!(values.len(), 2);
assert!(values.contains(b"NM"));
assert!(values.contains(b"MD"));
}
other => panic!("expected Specific, got {other:?}"),
}
}
#[test]
fn requirements_presence_only_lands_in_presence_set() {
let n = RikerRecordRequirements::NONE.with_aux_tag_presence(*b"SA");
match n.aux_tags {
AuxTagRequirements::Specific { values, presence } => {
assert!(values.is_empty());
assert_eq!(presence.len(), 1);
assert!(presence.contains(b"SA"));
}
other => panic!("expected Specific, got {other:?}"),
}
}
#[test]
fn requirements_value_request_subsumes_later_presence_request() {
let n = RikerRecordRequirements::NONE.with_aux_tag(*b"SA").with_aux_tag_presence(*b"SA");
match n.aux_tags {
AuxTagRequirements::Specific { values, presence } => {
assert!(values.contains(b"SA"));
assert!(presence.is_empty());
}
other => panic!("expected Specific, got {other:?}"),
}
}
#[test]
fn requirements_value_request_evicts_earlier_presence() {
let n = RikerRecordRequirements::NONE.with_aux_tag_presence(*b"SA").with_aux_tag(*b"SA");
match n.aux_tags {
AuxTagRequirements::Specific { values, presence } => {
assert!(values.contains(b"SA"));
assert!(presence.is_empty());
}
other => panic!("expected Specific, got {other:?}"),
}
}
#[test]
fn requirements_union_value_subsumes_presence_across_collectors() {
let a = RikerRecordRequirements::NONE.with_aux_tag(*b"SA");
let b = RikerRecordRequirements::NONE.with_aux_tag_presence(*b"SA");
let u = a.union(b);
match u.aux_tags {
AuxTagRequirements::Specific { values, presence } => {
assert!(values.contains(b"SA"));
assert!(presence.is_empty());
}
other => panic!("expected Specific, got {other:?}"),
}
}
#[test]
fn requirements_with_all_aux_tags_sets_all() {
let n = RikerRecordRequirements::NONE.with_aux_tag(*b"NM").with_all_aux_tags();
assert_eq!(n.aux_tags, AuxTagRequirements::All);
}
#[test]
fn requirements_union_of_none_and_specific_is_specific() {
let a = RikerRecordRequirements::NONE;
let b = RikerRecordRequirements::NONE.with_aux_tag(*b"NM");
assert_eq!(a.union(b.clone()), b);
}
#[test]
fn requirements_union_merges_specific_sets() {
let a = RikerRecordRequirements::NONE.with_aux_tag(*b"NM");
let b = RikerRecordRequirements::NONE.with_aux_tag(*b"MD");
let u = a.union(b);
match u.aux_tags {
AuxTagRequirements::Specific { values, presence: _ } => {
assert!(values.contains(b"NM"));
assert!(values.contains(b"MD"));
}
other => panic!("expected Specific, got {other:?}"),
}
}
#[test]
fn requirements_union_with_all_absorbs_everything() {
let a = RikerRecordRequirements::NONE.with_aux_tag(*b"NM");
let b = RikerRecordRequirements::everything();
assert_eq!(a.clone().union(b.clone()), b);
assert_eq!(b.clone().union(a), b);
}
#[test]
fn requirements_union_is_commutative_on_specific() {
let a = RikerRecordRequirements::NONE.with_aux_tag(*b"NM");
let b = RikerRecordRequirements::NONE.with_aux_tag(*b"MD");
assert_eq!(a.clone().union(b.clone()), b.union(a));
}
#[test]
fn requirements_union_is_idempotent() {
let a = RikerRecordRequirements::NONE.with_sequence().with_aux_tag(*b"NM");
assert_eq!(a.clone().union(a.clone()), a);
}
#[test]
fn requirements_sequence_flag_unions_with_or() {
let a = RikerRecordRequirements::NONE.with_sequence();
let b = RikerRecordRequirements::NONE;
assert!(a.clone().union(b.clone()).needs_sequence());
assert!(b.union(a).needs_sequence());
}
}