use crate::utils::{TraceErr, TraceOk};
use seqair_types::{Pos0, SmallVec};
pub const CIGAR_M: u8 = 0;
pub const CIGAR_I: u8 = 1;
pub const CIGAR_D: u8 = 2;
pub const CIGAR_N: u8 = 3;
pub const CIGAR_S: u8 = 4;
pub const CIGAR_H: u8 = 5;
pub const CIGAR_P: u8 = 6;
pub const CIGAR_EQ: u8 = 7;
pub const CIGAR_X: u8 = 8;
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CigarOpType {
Match,
Insertion,
Deletion,
RefSkip,
SoftClip,
HardClip,
Padding,
SeqMatch,
SeqMismatch,
Unknown(u8),
}
impl CigarOpType {
pub const fn from_bam(code: u8) -> Self {
match code {
CIGAR_M => Self::Match,
CIGAR_I => Self::Insertion,
CIGAR_D => Self::Deletion,
CIGAR_N => Self::RefSkip,
CIGAR_S => Self::SoftClip,
CIGAR_H => Self::HardClip,
CIGAR_P => Self::Padding,
CIGAR_EQ => Self::SeqMatch,
CIGAR_X => Self::SeqMismatch,
other => Self::Unknown(other),
}
}
pub const fn to_bam_code(self) -> u8 {
match self {
Self::Match => CIGAR_M,
Self::Insertion => CIGAR_I,
Self::Deletion => CIGAR_D,
Self::RefSkip => CIGAR_N,
Self::SoftClip => CIGAR_S,
Self::HardClip => CIGAR_H,
Self::Padding => CIGAR_P,
Self::SeqMatch => CIGAR_EQ,
Self::SeqMismatch => CIGAR_X,
Self::Unknown(code) => code,
}
}
pub const fn consumes_ref(self) -> bool {
matches!(
self,
Self::Match | Self::Deletion | Self::RefSkip | Self::SeqMatch | Self::SeqMismatch
)
}
pub const fn consumes_query(self) -> bool {
matches!(
self,
Self::Match | Self::Insertion | Self::SoftClip | Self::SeqMatch | Self::SeqMismatch
)
}
}
#[repr(transparent)]
#[derive(Clone, Copy, PartialEq, Eq, Hash, bytemuck::Pod, bytemuck::Zeroable)]
pub struct CigarOp(u32);
const _: () = assert!(size_of::<CigarOp>() == 4, "CigarOp must stay 4 bytes");
impl CigarOp {
pub const fn new(op: CigarOpType, len: u32) -> Self {
debug_assert!(len < (1 << 28), "CIGAR op length exceeds 28-bit BAM limit");
Self((len << 4) | op.to_bam_code() as u32)
}
pub const fn from_bam_u32(packed: u32) -> Self {
Self(packed)
}
pub const fn to_bam_u32(self) -> u32 {
self.0
}
#[allow(clippy::len_without_is_empty, reason = "len is the CIGAR op span, not a collection")]
pub const fn len(self) -> u32 {
self.0 >> 4
}
pub const fn op_code(self) -> u8 {
(self.0 & 0xF) as u8
}
pub const fn op_type(self) -> CigarOpType {
CigarOpType::from_bam(self.op_code())
}
pub const fn consumes_ref(self) -> bool {
consumes_ref(self.op_code())
}
pub const fn consumes_query(self) -> bool {
consumes_query(self.op_code())
}
#[inline]
pub fn ops_as_bytes(ops: &[Self]) -> &[u8] {
bytemuck::cast_slice(ops)
}
#[inline]
pub fn extend_from_bam_bytes(dst: &mut Vec<Self>, bytes: &[u8]) {
debug_assert!(bytes.len().is_multiple_of(4), "BAM CIGAR bytes must be multiple of 4");
#[cfg(target_endian = "little")]
{
let n_ops = bytes.len() / 4;
let n_bytes = n_ops.checked_mul(4).expect("cigar byte length overflow");
let new_len = dst.len().checked_add(n_ops).expect("cigar slab len overflow");
dst.reserve(n_ops);
unsafe {
let dst_ptr = dst.as_mut_ptr().add(dst.len()).cast::<u8>();
std::ptr::copy_nonoverlapping(bytes.as_ptr(), dst_ptr, n_bytes);
dst.set_len(new_len);
}
}
#[cfg(not(target_endian = "little"))]
{
for chunk in bytes.chunks_exact(4) {
let arr: [u8; 4] = chunk.try_into().expect("chunks_exact(4) yields 4 bytes");
dst.push(CigarOp::from_bam_u32(u32::from_le_bytes(arr)));
}
}
}
}
impl std::fmt::Debug for CigarOp {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("CigarOp").field("op", &self.op_type()).field("len", &self.len()).finish()
}
}
impl std::fmt::Display for CigarOpType {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
use std::fmt::Write as _;
let c = match self {
Self::Match => 'M',
Self::Insertion => 'I',
Self::Deletion => 'D',
Self::RefSkip => 'N',
Self::SoftClip => 'S',
Self::HardClip => 'H',
Self::Padding => 'P',
Self::SeqMatch => '=',
Self::SeqMismatch => 'X',
Self::Unknown(_) => '?',
};
f.write_char(c)
}
}
impl std::fmt::Display for CigarOp {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}{}", self.len(), self.op_type())
}
}
pub struct CigarStr<'a>(pub &'a [CigarOp]);
impl std::fmt::Display for CigarStr<'_> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
for op in self.0 {
write!(f, "{op}")?;
}
Ok(())
}
}
const fn consumes_ref(op: u8) -> bool {
matches!(op, CIGAR_M | CIGAR_D | CIGAR_N | CIGAR_EQ | CIGAR_X)
}
const fn consumes_query(op: u8) -> bool {
matches!(op, CIGAR_M | CIGAR_I | CIGAR_S | CIGAR_EQ | CIGAR_X)
}
pub fn calc_query_len(ops: &[CigarOp]) -> u32 {
let mut qlen = 0u32;
for op in ops {
if op.consumes_query() {
qlen = qlen.saturating_add(op.len());
}
}
qlen
}
pub fn calc_matches_indels(ops: &[CigarOp]) -> (u32, u32) {
let mut matches = 0u32;
let mut indels = 0u32;
for op in ops {
match op.op_code() {
CIGAR_M => matches = matches.saturating_add(op.len()),
CIGAR_I | CIGAR_D => indels = indels.saturating_add(op.len()),
_ => {}
}
}
(matches, indels)
}
pub fn compute_end_pos(pos: Pos0, ops: &[CigarOp]) -> Option<Pos0> {
let mut ref_len: i64 = 0;
for op in ops {
if op.consumes_ref() {
ref_len = ref_len.checked_add(i64::from(op.len()))?;
}
}
if ref_len == 0 {
Some(pos)
} else {
pos.checked_add_offset(seqair_types::Offset::new(ref_len.checked_sub(1)?))
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CigarPosInfo {
Match { qpos: u32 },
Insertion { qpos: u32, insert_len: u32 },
Deletion { del_len: u32 },
ComplexIndel { del_len: u32, insert_len: u32, is_refskip: bool },
RefSkip,
}
pub enum CigarMapping {
Linear { rec_pos: Pos0, query_offset: u32, match_len: u32 },
Complex(SmallVec<CompactOp, 6>),
}
impl std::fmt::Debug for CigarMapping {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::Linear { rec_pos, query_offset, match_len } => f
.debug_struct("Linear")
.field("rec_pos", &rec_pos)
.field("query_offset", query_offset)
.field("match_len", match_len)
.finish(),
Self::Complex(ops) => write!(f, "Complex({} ops)", ops.len()),
}
}
}
#[derive(Clone, Copy)]
pub struct CompactOp {
ref_start: i32,
query_start: u32,
len: u32,
op_type: u8,
}
impl CigarMapping {
#[inline]
pub fn new(rec_pos: Pos0, ops: &[CigarOp]) -> Option<Self> {
match try_linear(ops) {
Some((query_offset, match_len)) => {
Some(Self::Linear { rec_pos, query_offset, match_len })
}
None => Some(Self::Complex(build_compact_ops(rec_pos, ops)?)),
}
}
#[inline]
pub fn pos_info_at(&self, pos: Pos0) -> Option<CigarPosInfo> {
match self {
Self::Linear { rec_pos, query_offset, match_len } => {
let offset = pos.as_i64().wrapping_sub(rec_pos.as_i64());
if offset < 0 || offset >= i64::from(*match_len) {
return None;
}
Some(CigarPosInfo::Match {
qpos: u32::try_from(offset)
.trace_ok("offset exceeds u32 range")?
.checked_add(*query_offset)
.trace_err("qpos overflow")?,
})
}
Self::Complex(ops) => {
if ops.len() <= 4 {
pos_info_linear(ops, pos)
} else {
pos_info_bsearch(ops, pos)
}
}
}
}
}
#[inline]
fn try_linear(ops: &[CigarOp]) -> Option<(u32, u32)> {
let mut query_offset = 0u32;
let mut match_len = 0u32;
let mut phase = 0u8;
for op in ops {
let len = op.len();
match (phase, op.op_code()) {
(0, CIGAR_S) => query_offset = query_offset.saturating_add(len),
(0, CIGAR_H) => {}
(0, CIGAR_M | CIGAR_EQ | CIGAR_X) => {
match_len = match_len.saturating_add(len);
phase = 1;
}
(1, CIGAR_M | CIGAR_EQ | CIGAR_X) => match_len = match_len.saturating_add(len),
(1 | 2, CIGAR_S | CIGAR_H) => phase = 2,
_ => return None,
}
}
if phase >= 1 { Some((query_offset, match_len)) } else { None }
}
fn build_compact_ops(rec_pos: Pos0, ops: &[CigarOp]) -> Option<SmallVec<CompactOp, 6>> {
let mut out = SmallVec::with_capacity(ops.len());
let mut ref_off: i64 = 0;
let mut query_off: u32 = 0;
for op in ops {
let len = op.len();
let op_type = op.op_code();
let ref_start_i64 = rec_pos.as_i64().wrapping_add(ref_off);
let ref_start = i32::try_from(ref_start_i64).trace_ok("ref_start exceeds i32 range")?;
out.push(CompactOp { ref_start, query_start: query_off, len, op_type });
if consumes_ref(op_type) {
ref_off = ref_off.checked_add(i64::from(len)).trace_err("ref offset overflow")?;
}
if consumes_query(op_type) {
query_off = query_off.saturating_add(len);
}
}
Some(out)
}
#[inline]
fn next_insertion_len(ops: &[CompactOp], op_idx: usize) -> Option<u32> {
let mut total = 0u32;
let mut idx = op_idx.checked_add(1)?;
while let Some(next) = ops.get(idx) {
if next.op_type == CIGAR_P {
idx = idx.checked_add(1)?;
continue;
}
if next.op_type != CIGAR_I {
break;
}
total = total.saturating_add(next.len);
idx = idx.checked_add(1)?;
}
if total > 0 { Some(total) } else { None }
}
#[inline]
fn classify_op(
ops: &[CompactOp],
i: usize,
op: &CompactOp,
pos32: i32,
ref_end: i32,
) -> Option<CigarPosInfo> {
if consumes_query(op.op_type) {
debug_assert!(
matches!(op.op_type, CIGAR_M | CIGAR_EQ | CIGAR_X),
"classify_op reached query-consuming branch with unexpected op type {}",
op.op_type
);
let offset = pos32.wrapping_sub(op.ref_start) as u32;
let qpos = op.query_start.checked_add(offset).trace_err("qpos overflow")?;
if pos32 == ref_end.wrapping_sub(1)
&& let Some(insert_len) = next_insertion_len(ops, i)
{
return Some(CigarPosInfo::Insertion { qpos, insert_len });
}
Some(CigarPosInfo::Match { qpos })
} else if op.op_type == CIGAR_D {
if pos32 == ref_end.wrapping_sub(1)
&& let Some(insert_len) = next_insertion_len(ops, i)
{
return Some(CigarPosInfo::ComplexIndel {
del_len: op.len,
insert_len,
is_refskip: false,
});
}
Some(CigarPosInfo::Deletion { del_len: op.len })
} else if op.op_type == CIGAR_N {
if pos32 == ref_end.wrapping_sub(1)
&& let Some(insert_len) = next_insertion_len(ops, i)
{
return Some(CigarPosInfo::ComplexIndel {
del_len: op.len,
insert_len,
is_refskip: true,
});
}
Some(CigarPosInfo::RefSkip)
} else {
None
}
}
#[inline]
fn pos_info_linear(ops: &[CompactOp], pos: Pos0) -> Option<CigarPosInfo> {
let pos32 = pos.as_i32();
for (i, op) in ops.iter().enumerate() {
if !consumes_ref(op.op_type) {
continue;
}
let ref_end =
op.ref_start.saturating_add(i32::try_from(op.len).trace_ok("op len exceeds i32")?);
if pos32 < op.ref_start || pos32 >= ref_end {
continue;
}
return classify_op(ops, i, op, pos32, ref_end);
}
None
}
#[inline]
fn pos_info_bsearch(ops: &[CompactOp], pos: Pos0) -> Option<CigarPosInfo> {
let pos32 = pos.as_i32();
let idx = ops.partition_point(|op| op.ref_start <= pos32);
if idx == 0 {
return None;
}
for i in idx.saturating_sub(2)..ops.len().min(idx.saturating_add(1)) {
let Some(op) = ops.get(i) else { continue };
if !consumes_ref(op.op_type) {
continue;
}
let ref_end =
op.ref_start.saturating_add(i32::try_from(op.len).trace_ok("op len exceeds i32")?);
if pos32 >= op.ref_start && pos32 < ref_end {
return classify_op(ops, i, op, pos32, ref_end);
}
}
None
}
#[cfg(test)]
#[allow(clippy::arithmetic_side_effects, reason = "test arithmetic on known small values")]
mod tests {
use super::*;
fn op(op_type: CigarOpType, len: u32) -> CigarOp {
CigarOp::new(op_type, len)
}
#[test]
fn cigar_op_roundtrip_all_ops() {
let ops = [
(CigarOpType::Match, CIGAR_M),
(CigarOpType::Insertion, CIGAR_I),
(CigarOpType::Deletion, CIGAR_D),
(CigarOpType::RefSkip, CIGAR_N),
(CigarOpType::SoftClip, CIGAR_S),
(CigarOpType::HardClip, CIGAR_H),
(CigarOpType::Padding, CIGAR_P),
(CigarOpType::SeqMatch, CIGAR_EQ),
(CigarOpType::SeqMismatch, CIGAR_X),
];
for (op_type, code) in ops {
assert_eq!(op_type.to_bam_code(), code, "to_bam_code failed for {op_type:?}");
assert_eq!(CigarOpType::from_bam(code), op_type, "from_bam failed for code {code}");
let cigar_op = CigarOp::new(op_type, 42);
let packed = cigar_op.to_bam_u32();
let decoded = CigarOp::from_bam_u32(packed);
assert_eq!(decoded, cigar_op, "round-trip failed for {op_type:?}");
}
}
#[test]
fn cigar_op_max_length() {
let max_len = (1u32 << 28) - 1;
let op = CigarOp::new(CigarOpType::Match, max_len);
let packed = op.to_bam_u32();
let decoded = CigarOp::from_bam_u32(packed);
assert_eq!(decoded.len(), max_len);
assert_eq!(decoded.op_type(), CigarOpType::Match);
}
#[test]
fn cigar_op_unknown_code_round_trips() {
let packed = (100u32 << 4) | 9;
let op = CigarOp::from_bam_u32(packed);
assert_eq!(op.op_type(), CigarOpType::Unknown(9));
assert_eq!(op.len(), 100);
assert_eq!(op.to_bam_u32(), packed);
assert!(!op.consumes_ref());
assert!(!op.consumes_query());
}
#[test]
fn compact_op_ref_start_fits_i32_for_bam_positions() {
let cigar = [op(CigarOpType::Match, 100)];
let rec_pos = Pos0::new((i32::MAX as u32) - 100).unwrap(); let mapping = CigarMapping::new(rec_pos, &cigar).unwrap();
assert!(matches!(mapping, CigarMapping::Linear { .. }));
assert_eq!(mapping.pos_info_at(rec_pos), Some(CigarPosInfo::Match { qpos: 0 }));
}
#[test]
fn cigar_op_type_display_matches_sam_spec() {
assert_eq!(CigarOpType::Match.to_string(), "M");
assert_eq!(CigarOpType::Insertion.to_string(), "I");
assert_eq!(CigarOpType::Deletion.to_string(), "D");
assert_eq!(CigarOpType::RefSkip.to_string(), "N");
assert_eq!(CigarOpType::SoftClip.to_string(), "S");
assert_eq!(CigarOpType::HardClip.to_string(), "H");
assert_eq!(CigarOpType::Padding.to_string(), "P");
assert_eq!(CigarOpType::SeqMatch.to_string(), "=");
assert_eq!(CigarOpType::SeqMismatch.to_string(), "X");
assert_eq!(CigarOpType::Unknown(9).to_string(), "?");
}
#[test]
fn cigar_op_display_concatenates_len_and_op() {
assert_eq!(op(CigarOpType::Match, 10).to_string(), "10M");
assert_eq!(op(CigarOpType::Insertion, 2).to_string(), "2I");
assert_eq!(op(CigarOpType::SeqMatch, 4).to_string(), "4=");
}
#[test]
fn cigar_str_renders_full_cigar() {
let ops = [
op(CigarOpType::Match, 10),
op(CigarOpType::Insertion, 2),
op(CigarOpType::Deletion, 3),
];
assert_eq!(CigarStr(&ops).to_string(), "10M2I3D");
let clipped = [
op(CigarOpType::SoftClip, 5),
op(CigarOpType::Match, 50),
op(CigarOpType::SoftClip, 3),
];
assert_eq!(CigarStr(&clipped).to_string(), "5S50M3S");
assert_eq!(CigarStr(&[]).to_string(), "");
}
#[test]
fn linear_cigar_mapping_bounds_check() {
let cigar = [op(CigarOpType::SoftClip, 5), op(CigarOpType::Match, 100)];
let mapping = CigarMapping::new(Pos0::new(1000).unwrap(), &cigar).unwrap();
assert!(matches!(mapping, CigarMapping::Linear { .. }));
assert_eq!(
mapping.pos_info_at(Pos0::new(1000).unwrap()),
Some(CigarPosInfo::Match { qpos: 5 })
);
assert_eq!(
mapping.pos_info_at(Pos0::new(1050).unwrap()),
Some(CigarPosInfo::Match { qpos: 55 })
);
assert_eq!(
mapping.pos_info_at(Pos0::new(1099).unwrap()),
Some(CigarPosInfo::Match { qpos: 104 })
);
assert_eq!(
mapping.pos_info_at(Pos0::new(999).unwrap()),
None,
"pos before rec_pos must return None"
);
assert_eq!(
mapping.pos_info_at(Pos0::new(1100).unwrap()),
None,
"pos at rec_pos + match_len must return None"
);
assert_eq!(
mapping.pos_info_at(Pos0::new(1200).unwrap()),
None,
"pos past alignment end must return None"
);
assert_eq!(
mapping.pos_info_at(Pos0::new(0).unwrap()),
None,
"pos far before alignment must return None"
);
}
}