use rustc_hash::{FxBuildHasher, FxHashMap};
use crate::sam::riker_record::RikerRecord;
pub trait MateCache: Sized {
fn from_record(record: &RikerRecord) -> Self;
}
#[derive(Debug)]
pub enum MateAction<T> {
Alone,
Buffered,
PairWith(T),
}
#[derive(Debug)]
pub enum Peek<T> {
Alone,
WouldBuffer { overlap_start: u32, overlap_len: u32 },
PairWith(T),
}
impl MateCache for RikerRecord {
fn from_record(record: &RikerRecord) -> Self {
record.clone()
}
}
pub struct MateBuffer<T> {
buffer: FxHashMap<Vec<u8>, Entry<T>>,
}
struct Entry<T> {
value: T,
mate_ref_id: usize,
mate_pos: u32,
}
impl<T> MateBuffer<T> {
const INITIAL_CAPACITY: usize = 4096;
#[must_use]
pub fn new() -> Self {
Self { buffer: FxHashMap::with_capacity_and_hasher(Self::INITIAL_CAPACITY, FxBuildHasher) }
}
#[allow(clippy::cast_possible_truncation, reason = "1-based genomic positions fit in u32")]
pub fn probe(&mut self, record: &RikerRecord) -> Peek<T> {
let flags = record.flags();
if !flags.is_segmented() || flags.is_unmapped() || flags.is_mate_unmapped() {
return Peek::Alone;
}
let Some(ref_id) = record.reference_sequence_id() else { return Peek::Alone };
if Some(ref_id) != record.mate_reference_sequence_id() {
return Peek::Alone;
}
let Some(name) = record.name() else { return Peek::Alone };
let name_bytes: &[u8] = name;
if let Some(entry) = self.buffer.remove(name_bytes) {
return Peek::PairWith(entry.value);
}
let (Some(read_start_pos), Some(read_end_pos), Some(mate_start_pos)) =
(record.alignment_start(), record.alignment_end(), record.mate_alignment_start())
else {
return Peek::Alone;
};
let read_start = read_start_pos.get();
let read_end = read_end_pos.get();
let mate_start_1based = mate_start_pos.get();
if mate_start_1based >= read_start && mate_start_1based <= read_end {
let overlap_start = mate_start_1based.saturating_sub(1) as u32;
let overlap_len = (read_end + 1 - mate_start_1based) as u32;
return Peek::WouldBuffer { overlap_start, overlap_len };
}
Peek::Alone
}
#[allow(clippy::cast_possible_truncation, reason = "1-based genomic positions fit in u32")]
pub fn insert(&mut self, record: &RikerRecord, value: T) {
debug_assert!(record.reference_sequence_id().is_some() && record.name().is_some());
let Some(ref_id) = record.reference_sequence_id() else { return };
let Some(name) = record.name() else { return };
let mate_pos_1based = record.mate_alignment_start().map_or(0, |p| p.get());
let entry = Entry {
value,
mate_ref_id: ref_id,
mate_pos: mate_pos_1based.saturating_sub(1) as u32,
};
let name_bytes: &[u8] = name;
self.buffer.insert(name_bytes.to_vec(), entry);
}
pub fn flush_behind(&mut self, ref_id: usize, pos: u32) -> Vec<T> {
self.buffer
.extract_if(|_, entry| Self::is_behind(entry, ref_id, pos))
.map(|(_, e)| e.value)
.collect()
}
pub fn flush(&mut self) -> Vec<T> {
self.buffer.drain().map(|(_, e)| e.value).collect()
}
pub fn clear_behind(&mut self, ref_id: usize, pos: u32) {
self.buffer.retain(|_, entry| !Self::is_behind(entry, ref_id, pos));
}
pub fn clear(&mut self) {
self.buffer.clear();
}
#[must_use]
pub fn len(&self) -> usize {
self.buffer.len()
}
#[must_use]
pub fn is_empty(&self) -> bool {
self.buffer.is_empty()
}
fn is_behind(entry: &Entry<T>, ref_id: usize, pos: u32) -> bool {
entry.mate_ref_id < ref_id || (entry.mate_ref_id == ref_id && entry.mate_pos < pos)
}
}
impl<T: MateCache> MateBuffer<T> {
#[allow(clippy::cast_possible_truncation, reason = "1-based genomic positions fit in u32")]
pub fn accept(&mut self, record: &RikerRecord) -> MateAction<T> {
let flags = record.flags();
if !flags.is_segmented() || flags.is_unmapped() || flags.is_mate_unmapped() {
return MateAction::Alone;
}
let Some(ref_id) = record.reference_sequence_id() else {
return MateAction::Alone;
};
if Some(ref_id) != record.mate_reference_sequence_id() {
return MateAction::Alone;
}
let Some(name) = record.name() else {
return MateAction::Alone;
};
let name_bytes: &[u8] = name.as_ref();
if let Some(entry) = self.buffer.remove(name_bytes) {
return MateAction::PairWith(entry.value);
}
let (Some(read_start_pos), Some(read_end_pos), Some(mate_start_pos)) =
(record.alignment_start(), record.alignment_end(), record.mate_alignment_start())
else {
return MateAction::Alone;
};
let read_start = read_start_pos.get();
let read_end = read_end_pos.get();
let mate_start_1based = mate_start_pos.get();
if mate_start_1based >= read_start && mate_start_1based <= read_end {
let entry = Entry {
value: T::from_record(record),
mate_ref_id: ref_id,
mate_pos: mate_start_1based.saturating_sub(1) as u32,
};
self.buffer.insert(<[u8]>::to_vec(name_bytes), entry);
return MateAction::Buffered;
}
MateAction::Alone
}
}
impl<T> Default for MateBuffer<T> {
fn default() -> Self {
Self::new()
}
}
#[cfg(test)]
mod tests {
use super::*;
use noodles::core::Position;
use noodles::sam::Header;
use noodles::sam::alignment::RecordBuf;
use noodles::sam::alignment::record::cigar::Op;
use noodles::sam::alignment::record::{Flags, cigar::op::Kind};
use noodles::sam::alignment::record_buf::{Cigar, QualityScores, Sequence};
fn to_riker(record: &RecordBuf) -> RikerRecord {
RikerRecord::from_alignment_record(&Header::default(), record).unwrap()
}
fn paired_record(
name: &[u8],
ref_id: usize,
pos_1based: usize,
mate_pos_1based: usize,
) -> RecordBuf {
let cigar: Cigar = [Op::new(Kind::Match, 100)].into_iter().collect();
RecordBuf::builder()
.set_name(name.to_vec())
.set_flags(Flags::SEGMENTED)
.set_reference_sequence_id(ref_id)
.set_mate_reference_sequence_id(ref_id)
.set_alignment_start(Position::new(pos_1based).expect("pos"))
.set_mate_alignment_start(Position::new(mate_pos_1based).expect("mate_pos"))
.set_cigar(cigar)
.set_sequence(Sequence::from(vec![b'A'; 100]))
.set_quality_scores(QualityScores::from(vec![30u8; 100]))
.build()
}
fn unpaired_record(pos_1based: usize) -> RecordBuf {
let cigar: Cigar = [Op::new(Kind::Match, 50)].into_iter().collect();
RecordBuf::builder()
.set_name(b"unpaired".to_vec())
.set_flags(Flags::empty())
.set_reference_sequence_id(0)
.set_alignment_start(Position::new(pos_1based).expect("pos"))
.set_cigar(cigar)
.set_sequence(Sequence::from(vec![b'A'; 50]))
.set_quality_scores(QualityScores::from(vec![30u8; 50]))
.build()
}
#[derive(Debug)]
struct NameCache(Vec<u8>);
impl MateCache for NameCache {
fn from_record(record: &RikerRecord) -> Self {
Self(record.name().map_or_else(Vec::new, |n| <[u8]>::to_vec(n)))
}
}
#[test]
fn test_accept_unpaired_is_alone() {
let mut buf: MateBuffer<NameCache> = MateBuffer::new();
let r = unpaired_record(100);
assert!(matches!(buf.accept(&to_riker(&r)), MateAction::Alone));
assert!(buf.is_empty());
}
#[test]
fn test_accept_unmapped_is_alone() {
let mut buf: MateBuffer<NameCache> = MateBuffer::new();
let mut r = paired_record(b"q", 0, 100, 120);
*r.flags_mut() = Flags::SEGMENTED | Flags::UNMAPPED;
assert!(matches!(buf.accept(&to_riker(&r)), MateAction::Alone));
assert!(buf.is_empty());
}
#[test]
fn test_accept_mate_unmapped_is_alone() {
let mut buf: MateBuffer<NameCache> = MateBuffer::new();
let mut r = paired_record(b"q", 0, 100, 120);
*r.flags_mut() = Flags::SEGMENTED | Flags::MATE_UNMAPPED;
assert!(matches!(buf.accept(&to_riker(&r)), MateAction::Alone));
assert!(buf.is_empty());
}
#[test]
fn test_accept_cross_contig_is_alone() {
let mut buf: MateBuffer<NameCache> = MateBuffer::new();
let mut r = paired_record(b"q", 0, 100, 120);
*r.mate_reference_sequence_id_mut() = Some(1);
assert!(matches!(buf.accept(&to_riker(&r)), MateAction::Alone));
assert!(buf.is_empty());
}
fn nameless_paired_record(
ref_id: usize,
pos_1based: usize,
mate_pos_1based: usize,
) -> RecordBuf {
let cigar: Cigar = [Op::new(Kind::Match, 100)].into_iter().collect();
RecordBuf::builder()
.set_flags(Flags::SEGMENTED)
.set_reference_sequence_id(ref_id)
.set_mate_reference_sequence_id(ref_id)
.set_alignment_start(Position::new(pos_1based).expect("pos"))
.set_mate_alignment_start(Position::new(mate_pos_1based).expect("mate_pos"))
.set_cigar(cigar)
.set_sequence(Sequence::from(vec![b'A'; 100]))
.set_quality_scores(QualityScores::from(vec![30u8; 100]))
.build()
}
#[test]
fn test_accept_nameless_records_are_alone() {
let mut buf: MateBuffer<NameCache> = MateBuffer::new();
let r1 = nameless_paired_record(0, 100, 150);
let r2 = nameless_paired_record(0, 100, 150);
assert!(matches!(buf.accept(&to_riker(&r1)), MateAction::Alone));
assert!(matches!(buf.accept(&to_riker(&r2)), MateAction::Alone));
assert!(buf.is_empty());
}
#[test]
fn test_probe_nameless_records_are_alone() {
let mut buf: MateBuffer<NameCache> = MateBuffer::new();
let r = nameless_paired_record(0, 100, 150);
assert!(matches!(buf.probe(&to_riker(&r)), Peek::Alone));
assert!(buf.is_empty());
}
#[test]
fn test_accept_mate_before_read_is_alone() {
let mut buf: MateBuffer<NameCache> = MateBuffer::new();
let r = paired_record(b"q", 0, 100, 50);
assert!(matches!(buf.accept(&to_riker(&r)), MateAction::Alone));
assert!(buf.is_empty());
}
#[test]
fn test_accept_mate_past_read_end_is_alone() {
let mut buf: MateBuffer<NameCache> = MateBuffer::new();
let r = paired_record(b"q", 0, 100, 500);
assert!(matches!(buf.accept(&to_riker(&r)), MateAction::Alone));
assert!(buf.is_empty());
}
#[test]
fn test_accept_mate_in_span_is_buffered() {
let mut buf: MateBuffer<NameCache> = MateBuffer::new();
let r = paired_record(b"q", 0, 100, 150);
assert!(matches!(buf.accept(&to_riker(&r)), MateAction::Buffered));
assert_eq!(buf.len(), 1);
}
#[test]
fn test_accept_mate_at_read_start_is_buffered() {
let mut buf: MateBuffer<NameCache> = MateBuffer::new();
let r = paired_record(b"q", 0, 100, 100);
assert!(matches!(buf.accept(&to_riker(&r)), MateAction::Buffered));
}
#[test]
fn test_accept_mate_at_read_end_is_buffered() {
let mut buf: MateBuffer<NameCache> = MateBuffer::new();
let r = paired_record(b"q", 0, 100, 199); assert!(matches!(buf.accept(&to_riker(&r)), MateAction::Buffered));
}
#[test]
fn test_accept_pair_returns_cached_mate() {
let mut buf: MateBuffer<NameCache> = MateBuffer::new();
let first = paired_record(b"qname", 0, 100, 150);
assert!(matches!(buf.accept(&to_riker(&first)), MateAction::Buffered));
let second = paired_record(b"qname", 0, 150, 100);
match buf.accept(&to_riker(&second)) {
MateAction::PairWith(cache) => assert_eq!(cache.0, b"qname"),
other => panic!("expected PairWith, got {other:?}"),
}
assert!(buf.is_empty());
}
#[test]
fn test_flush_behind_yields_orphans_on_earlier_contig() {
let mut buf: MateBuffer<NameCache> = MateBuffer::new();
let r = paired_record(b"q1", 0, 100, 150);
let _ = buf.accept(&to_riker(&r));
let orphans = buf.flush_behind(1, 0);
assert_eq!(orphans.len(), 1);
assert!(buf.is_empty());
}
#[test]
fn test_flush_behind_yields_orphans_past_pos_on_same_contig() {
let mut buf: MateBuffer<NameCache> = MateBuffer::new();
let _ = buf.accept(&to_riker(&paired_record(b"q1", 0, 100, 150)));
let _ = buf.accept(&to_riker(&paired_record(b"q2", 0, 700, 750)));
let orphans = buf.flush_behind(0, 500);
assert_eq!(orphans.len(), 1);
assert_eq!(buf.len(), 1);
}
#[test]
fn test_flush_behind_empty_when_none_match() {
let mut buf: MateBuffer<NameCache> = MateBuffer::new();
let _ = buf.accept(&to_riker(&paired_record(b"q1", 0, 100, 150)));
let orphans = buf.flush_behind(0, 0);
assert!(orphans.is_empty());
assert_eq!(buf.len(), 1);
}
#[test]
fn test_flush_behind_is_strictly_before() {
let mut buf: MateBuffer<NameCache> = MateBuffer::new();
let _ = buf.accept(&to_riker(&paired_record(b"q1", 0, 100, 150)));
let stay = buf.flush_behind(0, 149);
assert!(stay.is_empty());
assert_eq!(buf.len(), 1);
let gone = buf.flush_behind(0, 150);
assert_eq!(gone.len(), 1);
assert!(buf.is_empty());
}
#[test]
fn test_flush_drains_all() {
let mut buf: MateBuffer<NameCache> = MateBuffer::new();
let _ = buf.accept(&to_riker(&paired_record(b"q1", 0, 100, 150)));
let _ = buf.accept(&to_riker(&paired_record(b"q2", 0, 200, 250)));
let all = buf.flush();
assert_eq!(all.len(), 2);
assert!(buf.is_empty());
}
#[test]
fn test_clear_behind_discards_without_yield() {
let mut buf: MateBuffer<NameCache> = MateBuffer::new();
let _ = buf.accept(&to_riker(&paired_record(b"q1", 0, 100, 150)));
let _ = buf.accept(&to_riker(&paired_record(b"q2", 0, 700, 750)));
buf.clear_behind(0, 500);
assert_eq!(buf.len(), 1);
}
#[test]
fn test_clear_empties_buffer() {
let mut buf: MateBuffer<NameCache> = MateBuffer::new();
let _ = buf.accept(&to_riker(&paired_record(b"q1", 0, 100, 150)));
let _ = buf.accept(&to_riker(&paired_record(b"q2", 0, 200, 250)));
buf.clear();
assert!(buf.is_empty());
}
#[test]
fn test_rikerrecord_cache_roundtrip() {
let mut buf: MateBuffer<RikerRecord> = MateBuffer::new();
let first = paired_record(b"qname", 0, 100, 150);
assert!(matches!(buf.accept(&to_riker(&first)), MateAction::Buffered));
let second = paired_record(b"qname", 0, 150, 100);
match buf.accept(&to_riker(&second)) {
MateAction::PairWith(cached) => {
let name = cached.name().unwrap();
assert_eq!(&**name, b"qname");
assert_eq!(cached.alignment_start().unwrap().get(), 100);
}
_ => panic!("expected PairWith"),
}
}
#[test]
fn test_probe_unpaired_is_alone() {
let mut buf: MateBuffer<NameCache> = MateBuffer::new();
let r = unpaired_record(100);
assert!(matches!(buf.probe(&to_riker(&r)), Peek::Alone));
assert!(buf.is_empty());
}
#[test]
fn test_probe_mate_in_span_reports_overlap_coords() {
let mut buf: MateBuffer<NameCache> = MateBuffer::new();
let r = paired_record(b"q", 0, 100, 150);
match buf.probe(&to_riker(&r)) {
Peek::WouldBuffer { overlap_start, overlap_len } => {
assert_eq!(overlap_start, 149);
assert_eq!(overlap_len, 50);
}
_ => panic!("expected WouldBuffer"),
}
assert!(buf.is_empty());
}
#[test]
fn test_probe_then_insert_makes_entry_visible_to_accept() {
let mut buf: MateBuffer<NameCache> = MateBuffer::new();
let first = paired_record(b"qn", 0, 100, 150);
let Peek::WouldBuffer { .. } = buf.probe(&to_riker(&first)) else {
panic!("expected WouldBuffer")
};
buf.insert(&to_riker(&first), NameCache(b"qn".to_vec()));
assert_eq!(buf.len(), 1);
let second = paired_record(b"qn", 0, 150, 100);
match buf.accept(&to_riker(&second)) {
MateAction::PairWith(cache) => assert_eq!(cache.0, b"qn"),
_ => panic!("expected PairWith"),
}
assert!(buf.is_empty());
}
#[test]
fn test_accept_paired_without_alignment_start_is_alone() {
let mut buf: MateBuffer<NameCache> = MateBuffer::new();
let cigar: Cigar = [Op::new(Kind::Match, 100)].into_iter().collect();
let r = RecordBuf::builder()
.set_name(b"q".to_vec())
.set_flags(Flags::SEGMENTED)
.set_reference_sequence_id(0)
.set_mate_reference_sequence_id(0)
.set_cigar(cigar)
.set_sequence(Sequence::from(vec![b'A'; 100]))
.set_quality_scores(QualityScores::from(vec![30u8; 100]))
.build();
assert!(matches!(buf.accept(&to_riker(&r)), MateAction::Alone));
assert!(buf.is_empty());
}
#[test]
fn test_probe_paired_without_alignment_start_is_alone() {
let mut buf: MateBuffer<NameCache> = MateBuffer::new();
let cigar: Cigar = [Op::new(Kind::Match, 100)].into_iter().collect();
let r = RecordBuf::builder()
.set_name(b"q".to_vec())
.set_flags(Flags::SEGMENTED)
.set_reference_sequence_id(0)
.set_mate_reference_sequence_id(0)
.set_cigar(cigar)
.set_sequence(Sequence::from(vec![b'A'; 100]))
.set_quality_scores(QualityScores::from(vec![30u8; 100]))
.build();
assert!(matches!(buf.probe(&to_riker(&r)), Peek::Alone));
assert!(buf.is_empty());
}
#[test]
fn test_probe_consumes_buffered_mate() {
let mut buf: MateBuffer<NameCache> = MateBuffer::new();
let first = paired_record(b"qn", 0, 100, 150);
let Peek::WouldBuffer { .. } = buf.probe(&to_riker(&first)) else {
panic!("expected WouldBuffer")
};
buf.insert(&to_riker(&first), NameCache(b"qn".to_vec()));
let second = paired_record(b"qn", 0, 150, 100);
assert!(matches!(buf.probe(&to_riker(&second)), Peek::PairWith(_)));
assert!(buf.is_empty());
}
}