use std::collections::HashMap;
use std::io::{self, BufRead, ErrorKind};
use anyhow::{Context, Result, anyhow, bail};
use fgumi_raw_bam::RawRecord;
use memchr::{memchr, memchr_iter};
use noodles_sam::Header;
use wide::u8x16;
const READ_CHUNK: usize = 256 * 1024;
static ENCODE_BASE: [u8; 256] = build_encode_table();
const fn build_encode_table() -> [u8; 256] {
let mut t = [15u8; 256];
t[b'=' as usize] = 0;
t[b'A' as usize] = 1;
t[b'C' as usize] = 2;
t[b'M' as usize] = 3;
t[b'G' as usize] = 4;
t[b'R' as usize] = 5;
t[b'S' as usize] = 6;
t[b'V' as usize] = 7;
t[b'T' as usize] = 8;
t[b'W' as usize] = 9;
t[b'Y' as usize] = 10;
t[b'H' as usize] = 11;
t[b'K' as usize] = 12;
t[b'D' as usize] = 13;
t[b'B' as usize] = 14;
t[b'N' as usize] = 15;
t[b'a' as usize] = 1;
t[b'c' as usize] = 2;
t[b'g' as usize] = 4;
t[b't' as usize] = 8;
t[b'n' as usize] = 15;
t
}
pub(crate) struct SamReader<R: BufRead> {
inner: R,
buf: Vec<u8>,
pos: usize,
tid_by_name: HashMap<Vec<u8>, i32>,
eof: bool,
cigar_scratch: Vec<u32>,
}
impl<R: BufRead> SamReader<R> {
pub(crate) fn new(reader: R) -> Self {
Self {
inner: reader,
buf: Vec::with_capacity(READ_CHUNK),
pos: 0,
tid_by_name: HashMap::new(),
eof: false,
cigar_scratch: Vec::with_capacity(32),
}
}
pub(crate) fn read_header(&mut self) -> Result<Header> {
let mut header_text: Vec<u8> = Vec::new();
loop {
self.ensure_some_bytes_or_eof().context("reading SAM header")?;
if self.pos == self.buf.len() {
break; }
if self.buf[self.pos] != b'@' {
break; }
let nl = self.find_newline_or_eof().context("reading SAM header line")?;
header_text.extend_from_slice(&self.buf[self.pos..nl]);
header_text.push(b'\n');
self.pos = (nl + 1).min(self.buf.len());
}
let header: Header = if header_text.is_empty() {
Header::default()
} else {
std::str::from_utf8(&header_text)
.context("SAM header is not valid UTF-8")?
.parse()
.context("parsing SAM header")?
};
for (idx, (name, _)) in header.reference_sequences().iter().enumerate() {
self.tid_by_name.insert(name.to_vec(), idx as i32);
}
Ok(header)
}
pub(crate) fn read_record(&mut self, rec: &mut RawRecord) -> io::Result<bool> {
loop {
while self.pos < self.buf.len() && self.buf[self.pos] == b'\n' {
self.pos += 1;
}
if self.pos >= self.buf.len() && !self.try_refill()? {
return Ok(false);
}
if let Some(off) = memchr(b'\n', &self.buf[self.pos..]) {
let line_start = self.pos;
let mut line_end = self.pos + off;
self.pos = line_end + 1;
if line_end > line_start && self.buf[line_end - 1] == b'\r' {
line_end -= 1;
}
if line_end == line_start {
continue; }
let SamReader { buf, tid_by_name, cigar_scratch, .. } = self;
parse_sam_line(&buf[line_start..line_end], rec, tid_by_name, cigar_scratch)
.map_err(|e| {
io::Error::new(ErrorKind::InvalidData, format!("parsing SAM record: {e:#}"))
})?;
return Ok(true);
}
if !self.try_refill()? {
if self.buf.len() > self.pos {
let line_start = self.pos;
let line_end = self.buf.len();
self.pos = line_end;
let SamReader { buf, tid_by_name, cigar_scratch, .. } = self;
parse_sam_line(&buf[line_start..line_end], rec, tid_by_name, cigar_scratch)
.map_err(|e| {
io::Error::new(
ErrorKind::InvalidData,
format!("parsing SAM record: {e:#}"),
)
})?;
return Ok(true);
}
return Ok(false);
}
}
}
fn ensure_some_bytes_or_eof(&mut self) -> io::Result<()> {
if self.pos >= self.buf.len() {
let _ = self.try_refill()?;
}
Ok(())
}
fn find_newline_or_eof(&mut self) -> io::Result<usize> {
loop {
if let Some(off) = memchr(b'\n', &self.buf[self.pos..]) {
return Ok(self.pos + off);
}
if !self.try_refill()? {
return Ok(self.buf.len());
}
}
}
fn try_refill(&mut self) -> io::Result<bool> {
if self.eof {
return Ok(false);
}
if self.pos > 0 {
let remaining = self.buf.len() - self.pos;
if remaining > 0 {
self.buf.copy_within(self.pos.., 0);
}
self.buf.truncate(remaining);
self.pos = 0;
}
let n = read_into_spare(&mut self.inner, &mut self.buf, READ_CHUNK)?;
if n == 0 {
self.eof = true;
return Ok(false);
}
Ok(true)
}
}
fn read_into_spare<R: std::io::Read>(
src: &mut R,
buf: &mut Vec<u8>,
cap: usize,
) -> io::Result<usize> {
use std::mem::MaybeUninit;
buf.reserve(cap);
let spare: &mut [MaybeUninit<u8>] = &mut buf.spare_capacity_mut()[..cap];
let target: &mut [u8] =
unsafe { std::slice::from_raw_parts_mut(spare.as_mut_ptr() as *mut u8, spare.len()) };
let n = src.read(target)?;
unsafe { buf.set_len(buf.len() + n) };
Ok(n)
}
fn parse_sam_line(
line: &[u8],
rec: &mut RawRecord,
tid_by_name: &HashMap<Vec<u8>, i32>,
cigar_scratch: &mut Vec<u32>,
) -> Result<()> {
let mut tabs: [usize; 11] = [0; 11];
let mut n_tabs = 0usize;
for off in memchr_iter(b'\t', line) {
if n_tabs < 11 {
tabs[n_tabs] = off;
n_tabs += 1;
} else {
break;
}
}
if n_tabs < 10 {
bail!("SAM line has only {} fields (needs at least 11)", n_tabs + 1);
}
let field = |i: usize| -> &[u8] {
let start = if i == 0 { 0 } else { tabs[i - 1] + 1 };
let end = if i < n_tabs { tabs[i] } else { line.len() };
&line[start..end]
};
let aux_start = if n_tabs >= 11 { tabs[10] + 1 } else { line.len() };
let qname = field(0);
let flag = u16::try_from(parse_u32_ascii(field(1))?)
.map_err(|_| anyhow!("FLAG field exceeds u16 range"))?;
let rname = field(2);
let pos_1based = parse_i32_ascii(field(3))?;
let mapq = u8::try_from(parse_u32_ascii(field(4))?)
.map_err(|_| anyhow!("MAPQ field exceeds u8 range"))?;
let cigar_text = field(5);
let rnext = field(6);
let pnext_1based = parse_i32_ascii(field(7))?;
let tlen = parse_i32_ascii(field(8))?;
let seq = field(9);
let qual = field(10);
let tid = name_to_tid(rname, tid_by_name)?;
let mtid = if rnext == b"=" { tid } else { name_to_tid(rnext, tid_by_name)? };
let pos_0based = if pos_1based > 0 { pos_1based - 1 } else { -1 };
let mpos_0based = if pnext_1based > 0 { pnext_1based - 1 } else { -1 };
cigar_scratch.clear();
if cigar_text != b"*" && !cigar_text.is_empty() {
parse_cigar_ops(cigar_text, cigar_scratch)?;
}
let n_cigar = cigar_scratch.len();
if n_cigar > u16::MAX as usize {
bail!("CIGAR op count {} exceeds u16::MAX", n_cigar);
}
let l_seq = if seq == b"*" { 0usize } else { seq.len() };
let qual_present = qual != b"*";
if qual_present && l_seq > 0 && qual.len() != l_seq {
bail!("QUAL length {} != SEQ length {}", qual.len(), l_seq);
}
let bin = if tid < 0 {
4680u16
} else {
let beg = pos_0based.max(0) as i64;
let ra_len = ref_length_from_cigar(cigar_scratch);
let end = beg + ra_len.max(1) as i64;
reg2bin(beg, end) as u16
};
let l_qname = qname.len() + 1;
if l_qname > 255 {
bail!("qname length {} exceeds u8::MAX", l_qname);
}
let vec = rec.as_mut_vec();
vec.clear();
vec.resize(32, 0);
vec.extend_from_slice(qname);
vec.push(0);
for &op in cigar_scratch.iter() {
vec.extend_from_slice(&op.to_le_bytes());
}
if l_seq > 0 {
pack_seq_into(seq, vec);
}
if l_seq > 0 {
if qual_present {
decode_qual_into(qual, vec);
} else {
vec.extend(std::iter::repeat_n(0xFFu8, l_seq));
}
}
if aux_start < line.len() {
parse_aux_tags(&line[aux_start..], vec)?;
}
write_bam_header(
&mut vec[..32],
tid,
pos_0based,
l_qname as u8,
mapq,
bin,
n_cigar as u16,
flag,
l_seq as u32,
mtid,
mpos_0based,
tlen,
);
Ok(())
}
#[allow(clippy::too_many_arguments)]
fn write_bam_header(
dst: &mut [u8],
tid: i32,
pos: i32,
l_qname: u8,
mapq: u8,
bin: u16,
n_cigar: u16,
flag: u16,
l_seq: u32,
mtid: i32,
mpos: i32,
tlen: i32,
) {
dst[0..4].copy_from_slice(&tid.to_le_bytes());
dst[4..8].copy_from_slice(&pos.to_le_bytes());
dst[8] = l_qname;
dst[9] = mapq;
dst[10..12].copy_from_slice(&bin.to_le_bytes());
dst[12..14].copy_from_slice(&n_cigar.to_le_bytes());
dst[14..16].copy_from_slice(&flag.to_le_bytes());
dst[16..20].copy_from_slice(&l_seq.to_le_bytes());
dst[20..24].copy_from_slice(&mtid.to_le_bytes());
dst[24..28].copy_from_slice(&mpos.to_le_bytes());
dst[28..32].copy_from_slice(&tlen.to_le_bytes());
}
fn name_to_tid(name: &[u8], tid_by_name: &HashMap<Vec<u8>, i32>) -> Result<i32> {
if name == b"*" {
return Ok(-1);
}
tid_by_name
.get(name)
.copied()
.ok_or_else(|| anyhow!("RNAME/RNEXT '{}' not in header", String::from_utf8_lossy(name)))
}
fn parse_u32_ascii(bytes: &[u8]) -> Result<u32> {
if bytes.is_empty() {
bail!("empty integer field");
}
let mut n = 0u32;
for &b in bytes {
if !b.is_ascii_digit() {
bail!("invalid digit in integer: 0x{:02x}", b);
}
n = n
.checked_mul(10)
.and_then(|x| x.checked_add((b - b'0') as u32))
.ok_or_else(|| anyhow!("integer overflow parsing u32 from {:?}", bstr_lossy(bytes)))?;
}
Ok(n)
}
fn parse_i32_ascii(bytes: &[u8]) -> Result<i32> {
if bytes.is_empty() {
bail!("empty integer field");
}
let (negative, digits) = match bytes[0] {
b'-' => (true, &bytes[1..]),
b'+' => (false, &bytes[1..]),
_ => (false, bytes),
};
if digits.is_empty() {
bail!("integer field has only a sign");
}
let mut n: i32 = 0;
for &b in digits {
if !b.is_ascii_digit() {
bail!("invalid digit in integer: 0x{:02x}", b);
}
let digit = (b - b'0') as i32;
n = n
.checked_mul(10)
.and_then(|x| x.checked_sub(digit))
.ok_or_else(|| anyhow!("integer overflow parsing i32 from {:?}", bstr_lossy(bytes)))?;
}
if negative {
Ok(n)
} else {
n.checked_neg()
.ok_or_else(|| anyhow!("integer overflow parsing i32 from {:?}", bstr_lossy(bytes)))
}
}
fn bstr_lossy(bytes: &[u8]) -> String {
String::from_utf8_lossy(bytes).into_owned()
}
fn parse_cigar_ops(text: &[u8], out: &mut Vec<u32>) -> Result<()> {
let mut len: u32 = 0;
let mut any_digits = false;
for &b in text {
if b.is_ascii_digit() {
len = len
.checked_mul(10)
.and_then(|x| x.checked_add((b - b'0') as u32))
.ok_or_else(|| anyhow!("CIGAR op length overflows u32"))?;
any_digits = true;
} else {
if !any_digits {
bail!("CIGAR op without preceding length");
}
if len == 0 {
bail!("CIGAR op with zero length is invalid per SAM spec");
}
if len > (u32::MAX >> 4) {
bail!("CIGAR op length {len} exceeds 28-bit BAM limit");
}
let op_code = match b {
b'M' => 0u32,
b'I' => 1,
b'D' => 2,
b'N' => 3,
b'S' => 4,
b'H' => 5,
b'P' => 6,
b'=' => 7,
b'X' => 8,
_ => bail!("invalid CIGAR op code: 0x{:02x}", b),
};
out.push((len << 4) | op_code);
len = 0;
any_digits = false;
}
}
if any_digits {
bail!("CIGAR has trailing length without op");
}
Ok(())
}
fn pack_seq_into(seq: &[u8], out: &mut Vec<u8>) {
let n = seq.len();
out.reserve(n.div_ceil(2));
let mut i = 0;
while i + 16 <= n {
let bases_arr: [u8; 16] = seq[i..i + 16].try_into().unwrap();
let bases = u8x16::new(bases_arr);
if let Some(packed8) = simd_pack_chunk(bases) {
out.extend_from_slice(&packed8);
} else {
for j in (0..16).step_by(2) {
let hi = ENCODE_BASE[seq[i + j] as usize];
let lo = ENCODE_BASE[seq[i + j + 1] as usize];
out.push((hi << 4) | lo);
}
}
i += 16;
}
while i + 2 <= n {
let hi = ENCODE_BASE[seq[i] as usize];
let lo = ENCODE_BASE[seq[i + 1] as usize];
out.push((hi << 4) | lo);
i += 2;
}
if i < n {
let hi = ENCODE_BASE[seq[i] as usize];
out.push(hi << 4);
}
}
#[inline]
fn simd_pack_chunk(bases: u8x16) -> Option<[u8; 8]> {
let upper = bases & u8x16::splat(0xDF);
let is_a = upper.simd_eq(u8x16::splat(b'A'));
let is_c = upper.simd_eq(u8x16::splat(b'C'));
let is_g = upper.simd_eq(u8x16::splat(b'G'));
let is_t = upper.simd_eq(u8x16::splat(b'T'));
let is_n = upper.simd_eq(u8x16::splat(b'N'));
let is_eq = bases.simd_eq(u8x16::splat(b'='));
let valid = is_a | is_c | is_g | is_t | is_n | is_eq;
if valid.to_bitmask() & 0xFFFF != 0xFFFF {
return None;
}
const BAM_LUT: u8x16 = u8x16::new([
15, 1, 15, 2, 8, 15, 15, 4, 15, 15, 15, 15, 15, 0, 15, 15, ]);
let low4 = bases & u8x16::splat(0x0F);
let nibbles = BAM_LUT.swizzle_relaxed(low4);
const EVENS_IDX: u8x16 = u8x16::new([0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14]);
const ODDS_IDX: u8x16 = u8x16::new([1, 3, 5, 7, 9, 11, 13, 15, 1, 3, 5, 7, 9, 11, 13, 15]);
const SHIFT4_LUT: u8x16 =
u8x16::new([0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]);
let evens = nibbles.swizzle_relaxed(EVENS_IDX);
let odds = nibbles.swizzle_relaxed(ODDS_IDX);
let evens_shifted = SHIFT4_LUT.swizzle_relaxed(evens);
let packed = evens_shifted | odds;
let arr = packed.to_array();
let mut result = [0u8; 8];
result.copy_from_slice(&arr[..8]);
Some(result)
}
#[inline]
fn decode_qual_into(qual: &[u8], out: &mut Vec<u8>) {
let n = qual.len();
let start = out.len();
out.resize(start + n, 0);
let dst = &mut out[start..];
let bias = u8x16::splat(33);
let mut i = 0;
while i + 16 <= n {
let arr: [u8; 16] = qual[i..i + 16].try_into().unwrap();
let bytes = u8x16::new(arr);
let result = bytes - bias; dst[i..i + 16].copy_from_slice(&result.to_array());
i += 16;
}
while i < n {
dst[i] = qual[i].wrapping_sub(33);
i += 1;
}
}
fn ref_length_from_cigar(ops: &[u32]) -> i32 {
let mut len = 0i32;
for &op in ops {
let l = (op >> 4) as i32;
match op & 0xF {
0 | 2 | 3 | 7 | 8 => len = len.saturating_add(l),
_ => {}
}
}
len
}
fn reg2bin(beg: i64, end: i64) -> u32 {
const LEVEL_OFFSETS: [u32; 5] = [4681, 585, 73, 9, 1];
const LEVEL_SHIFTS: [u32; 5] = [14, 17, 20, 23, 26];
let end = end - 1;
for (offset, shift) in LEVEL_OFFSETS.iter().zip(LEVEL_SHIFTS.iter()) {
if beg >> shift == end >> shift {
return offset.wrapping_add((beg >> shift) as u32);
}
}
0
}
fn parse_aux_tags(text: &[u8], out: &mut Vec<u8>) -> Result<()> {
for tag in text.split(|&b| b == b'\t') {
if tag.is_empty() {
continue;
}
if tag.len() < 5 || tag[2] != b':' || tag[4] != b':' {
bail!("malformed aux tag: '{}'", String::from_utf8_lossy(tag));
}
let key = &tag[0..2];
let ty = tag[3];
let val = &tag[5..];
out.extend_from_slice(key);
match ty {
b'A' => {
if val.len() != 1 {
bail!("A-type aux value must be exactly one character");
}
out.push(b'A');
out.push(val[0]);
}
b'i' => {
let v = parse_i32_ascii(val)?;
emit_int_tag_smallest(v, out);
}
b'f' => {
let f: f32 = std::str::from_utf8(val)
.context("aux f value not UTF-8")?
.parse()
.context("parsing f aux value")?;
out.push(b'f');
out.extend_from_slice(&f.to_le_bytes());
}
b'Z' => {
out.push(b'Z');
out.extend_from_slice(val);
out.push(0);
}
b'H' => {
out.push(b'H');
out.extend_from_slice(val);
out.push(0);
}
b'B' => emit_b_array_tag(val, out)?,
_ => bail!("unknown aux type code: 0x{:02x}", ty),
}
}
Ok(())
}
fn emit_int_tag_smallest(v: i32, out: &mut Vec<u8>) {
if v >= 0 {
if v <= u8::MAX as i32 {
out.push(b'C');
out.push(v as u8);
} else if v <= u16::MAX as i32 {
out.push(b'S');
out.extend_from_slice(&(v as u16).to_le_bytes());
} else {
out.push(b'I');
out.extend_from_slice(&(v as u32).to_le_bytes());
}
} else if v >= i8::MIN as i32 {
out.push(b'c');
out.push((v as i8) as u8);
} else if v >= i16::MIN as i32 {
out.push(b's');
out.extend_from_slice(&(v as i16).to_le_bytes());
} else {
out.push(b'i');
out.extend_from_slice(&v.to_le_bytes());
}
}
fn emit_b_array_tag(val: &[u8], out: &mut Vec<u8>) -> Result<()> {
if val.is_empty() {
bail!("empty B-type aux value");
}
let subtype = val[0];
if !matches!(subtype, b'c' | b'C' | b's' | b'S' | b'i' | b'I' | b'f') {
bail!("invalid B-array subtype: 0x{:02x}", subtype);
}
let rest = if val.len() > 1 && val[1] == b',' { &val[2..] } else { &val[1..] };
out.push(b'B');
out.push(subtype);
let count = if rest.is_empty() {
0u32
} else {
u32::try_from(rest.iter().filter(|&&b| b == b',').count() + 1)
.map_err(|_| anyhow!("B array element count exceeds u32"))?
};
out.extend_from_slice(&count.to_le_bytes());
if count == 0 {
return Ok(());
}
for elem in rest.split(|&b| b == b',') {
match subtype {
b'c' => {
let v = i8::try_from(parse_i32_ascii(elem)?)
.map_err(|_| anyhow!("B:c element {:?} out of i8 range", bstr_lossy(elem)))?;
out.push(v as u8);
}
b'C' => {
let v = u8::try_from(parse_u32_ascii(elem)?)
.map_err(|_| anyhow!("B:C element {:?} out of u8 range", bstr_lossy(elem)))?;
out.push(v);
}
b's' => {
let v = i16::try_from(parse_i32_ascii(elem)?)
.map_err(|_| anyhow!("B:s element {:?} out of i16 range", bstr_lossy(elem)))?;
out.extend_from_slice(&v.to_le_bytes());
}
b'S' => {
let v = u16::try_from(parse_u32_ascii(elem)?)
.map_err(|_| anyhow!("B:S element {:?} out of u16 range", bstr_lossy(elem)))?;
out.extend_from_slice(&v.to_le_bytes());
}
b'i' => out.extend_from_slice(&parse_i32_ascii(elem)?.to_le_bytes()),
b'I' => out.extend_from_slice(&parse_u32_ascii(elem)?.to_le_bytes()),
b'f' => {
let f: f32 = std::str::from_utf8(elem)
.context("B-f element not UTF-8")?
.parse()
.context("parsing B-f element")?;
out.extend_from_slice(&f.to_le_bytes());
}
_ => bail!("unknown B subtype: 0x{:02x}", subtype),
}
}
Ok(())
}
#[cfg(test)]
mod parser_tests {
use super::*;
#[test]
fn parse_u32_accepts_zero() {
assert_eq!(parse_u32_ascii(b"0").unwrap(), 0);
}
#[test]
fn parse_u32_rejects_overflow() {
assert!(parse_u32_ascii(b"4294967296").is_err());
assert!(parse_u32_ascii(b"9999999999").is_err());
}
#[test]
fn parse_u32_rejects_non_digits() {
assert!(parse_u32_ascii(b"12a3").is_err());
assert!(parse_u32_ascii(b"").is_err());
}
#[test]
fn parse_i32_handles_min_value() {
assert_eq!(parse_i32_ascii(b"-2147483648").unwrap(), i32::MIN);
}
#[test]
fn parse_i32_rejects_overflow_either_direction() {
assert!(parse_i32_ascii(b"2147483648").is_err());
assert!(parse_i32_ascii(b"-2147483649").is_err());
}
#[test]
fn parse_cigar_rejects_zero_length_op() {
let mut out = Vec::new();
assert!(parse_cigar_ops(b"50M0I50M", &mut out).is_err());
}
#[test]
fn parse_cigar_rejects_28bit_overflow() {
let mut out = Vec::new();
assert!(parse_cigar_ops(b"268435456M", &mut out).is_err());
}
#[test]
fn parse_cigar_accepts_max_28bit_length() {
let mut out = Vec::new();
let max = u32::MAX >> 4;
let s = format!("{max}M");
assert!(parse_cigar_ops(s.as_bytes(), &mut out).is_ok());
assert_eq!(out.len(), 1);
assert_eq!(out[0] >> 4, max);
assert_eq!(out[0] & 0xf, 0); }
#[test]
fn b_array_tag_rejects_unknown_subtype() {
let mut out = Vec::new();
assert!(emit_b_array_tag(b"x,1,2,3", &mut out).is_err());
}
#[test]
fn b_array_tag_rejects_out_of_range_element() {
let mut out = Vec::new();
assert!(emit_b_array_tag(b"c,1,2,999", &mut out).is_err());
out.clear();
assert!(emit_b_array_tag(b"C,1,2,-1", &mut out).is_err());
}
#[test]
fn b_array_tag_accepts_valid_inputs() {
let mut out = Vec::new();
emit_b_array_tag(b"i,-1,2,300", &mut out).unwrap();
assert_eq!(out[0..2], *b"Bi");
assert_eq!(u32::from_le_bytes(out[2..6].try_into().unwrap()), 3);
}
}