use std::collections::BTreeMap;
use anyhow::Result;
use fgumi_raw_bam::RawRecord;
use crate::raw_writer::RawBamWriter;
use crate::reference::{BASE_A, BASE_C, BASE_G, BASE_T, Context, RefCodes, RefEncoding, Reference};
use crate::{ContextMask, TagSpec};
pub(crate) const FLAG_PAIRED: u16 = 0x1;
pub(crate) const FLAG_UNMAPPED: u16 = 0x4;
pub(crate) const FLAG_REVERSE: u16 = 0x10;
pub(crate) const FLAG_FIRST_SEGMENT: u16 = 0x40;
pub(crate) const FLAG_LAST_SEGMENT: u16 = 0x80;
pub(crate) const FLAG_SECONDARY: u16 = 0x100;
pub(crate) const FLAG_QC_FAIL: u16 = 0x200;
pub(crate) const FLAG_SUPPLEMENTARY: u16 = 0x800;
pub(crate) struct RecordProcessor {
reference: Reference,
opts: ProcessorOptions,
has_controls: bool,
}
impl RecordProcessor {
#[must_use]
pub(crate) fn new(reference: Reference, opts: ProcessorOptions) -> Self {
let has_controls = opts.scope_of_tid.iter().any(Option::is_some);
Self { reference, opts, has_controls }
}
pub(crate) fn process_block(
&self,
block: &mut [RawRecord],
stats: &mut Stats,
out: &mut RawBamWriter,
) -> Result<()> {
stats.total_templates += 1;
let class_idx = self.classification_index(block)?;
let class_rec = &block[class_idx];
if has(class_rec.flags(), FLAG_UNMAPPED) {
stats.genome.n_templates += 1;
stats.unmapped_templates += 1;
return self.emit(block, Action::PassThrough, (0, 0), out);
}
let tid = class_rec.ref_id();
let scope_idx = self.opts.scope_of_tid.get(tid as usize).copied().flatten();
let overlap = self.overlap_skip(block);
let trimming = self.opts.ignore_template_ends > 0;
let termini =
if trimming { self.template_termini(block) } else { TemplateTermini::default() };
let mut counters = PerContextCounters::default();
for (i, rec) in block.iter().enumerate() {
if self.is_evidence_record(rec) {
let (trim_lo, trim_hi, mate_terminus) = if trimming {
let (lo, hi) = termini.own_trim_for(rec, self.opts.ignore_template_ends);
(lo, hi, termini.mate_skip_for(rec))
} else {
(0, 0, None)
};
let overlap_iv = overlap.and_then(|(i1, i2, (os, oe))| {
if i != i1 && i != i2 {
return None;
}
if trimming {
return (i == i2).then_some((os, oe));
}
let mid = os + (oe - os) / 2;
Some(if has(rec.flags(), FLAG_REVERSE) { (os, mid) } else { (mid, oe) })
});
let trim = RecordTrim { trim_lo, trim_hi, skip: overlap_iv.or(mate_terminus) };
self.tally_record(rec, trim, &mut counters);
}
}
let saw_control_supp = self.has_controls
&& scope_idx.is_none()
&& block.iter().any(|rec| {
let f = rec.flags();
has(f, FLAG_SUPPLEMENTARY) && !has(f, FLAG_UNMAPPED) && {
let rtid = rec.ref_id();
rtid >= 0
&& self.opts.scope_of_tid.get(rtid as usize).copied().flatten().is_some()
}
});
let monitored = counters.monitored_total();
let counts =
(counters.unconv_in(self.opts.contexts), counters.total_in(self.opts.contexts));
match scope_idx {
Some(ci) => {
let scope = &mut stats.controls[ci];
scope.n_templates += 1;
if monitored > 0 {
scope.n_evaluated += 1;
}
scope.counters.add(&counters);
self.emit(block, Action::PassThrough, counts, out)
}
None => {
stats.genome.n_templates += 1;
stats.genome.counters.add(&counters);
if self.opts.record_matrix {
*stats.conversion_matrix.entry((counts.1, counts.0)).or_insert(0) += 1;
}
if saw_control_supp {
stats.chimeric_to_control_templates += 1;
}
if monitored == 0 {
stats.zero_site_templates += 1;
} else {
stats.genome.n_evaluated += 1;
if counts.1 > 0 && counts.1 < u64::from(self.opts.min_sites) {
stats.below_min_sites_templates += 1;
}
}
let unconverted = self.decide(&counters);
let action = if unconverted {
stats.genome.n_unconverted += 1;
if self.opts.remove_unconverted {
stats.genome.n_removed += 1;
Action::Remove
} else {
Action::Mark
}
} else {
Action::PassThrough
};
self.emit(block, action, counts, out)
}
}
}
fn classification_index(&self, block: &[RawRecord]) -> Result<usize> {
let mut r1_primary = None;
let mut unpaired_primary = None;
let mut any_primary = None;
for (i, rec) in block.iter().enumerate() {
let f = rec.flags();
if has(f, FLAG_SECONDARY | FLAG_SUPPLEMENTARY) {
continue;
}
any_primary.get_or_insert(i);
if !has(f, FLAG_PAIRED) {
unpaired_primary.get_or_insert(i);
} else if has(f, FLAG_FIRST_SEGMENT) {
r1_primary.get_or_insert(i);
}
}
r1_primary.or(unpaired_primary).or(any_primary).ok_or_else(|| {
let qname = block.first().map(|r| r.read_name().to_vec()).unwrap_or_default();
anyhow::anyhow!(
"QNAME {} appeared with {} record(s) but no primary alignment — the primary must \
be elsewhere in the stream. This almost always means the input is not \
query-grouped (e.g. coordinate-sorted). Re-sort with `samtools sort -n` and \
re-run.",
String::from_utf8_lossy(&qname),
block.len(),
)
})
}
#[inline]
fn is_evidence_record(&self, rec: &RawRecord) -> bool {
let f = rec.flags();
if has(f, FLAG_UNMAPPED) || has(f, FLAG_SECONDARY) {
return false;
}
if has(f, FLAG_SUPPLEMENTARY) && self.opts.ignore_supplementary_evidence {
return false;
}
true
}
fn overlap_skip(&self, block: &[RawRecord]) -> Option<(usize, usize, (usize, usize))> {
let mut r1 = None;
let mut r2 = None;
for (i, rec) in block.iter().enumerate() {
let f = rec.flags();
if has(f, FLAG_SECONDARY | FLAG_SUPPLEMENTARY | FLAG_UNMAPPED) || !has(f, FLAG_PAIRED) {
continue;
}
if has(f, FLAG_FIRST_SEGMENT) {
r1.get_or_insert(i);
} else if has(f, FLAG_LAST_SEGMENT) {
r2.get_or_insert(i);
}
}
let (i1, i2) = (r1?, r2?);
let (a, b) = (&block[i1], &block[i2]);
if a.ref_id() < 0 || a.ref_id() != b.ref_id() {
return None;
}
if monitor_c_of(a.flags()) != monitor_c_of(b.flags()) {
return None;
}
let a1 = a.pos() as usize;
let b1 = a1 + a.reference_length().max(0) as usize;
let a2 = b.pos() as usize;
let b2 = a2 + b.reference_length().max(0) as usize;
let start = a1.max(a2);
let end = b1.min(b2);
if start < end { Some((i1, i2, (start, end))) } else { None }
}
#[inline(never)]
fn template_termini(&self, block: &[RawRecord]) -> TemplateTermini {
let n = self.opts.ignore_template_ends as usize;
if n == 0 {
return TemplateTermini::default();
}
let mut r1: Option<&RawRecord> = None;
let mut r2: Option<&RawRecord> = None;
let mut unpaired: Option<&RawRecord> = None;
for rec in block {
let f = rec.flags();
if has(f, FLAG_SECONDARY | FLAG_SUPPLEMENTARY | FLAG_UNMAPPED) {
continue;
}
if !has(f, FLAG_PAIRED) {
unpaired.get_or_insert(rec);
} else if has(f, FLAG_FIRST_SEGMENT) {
r1.get_or_insert(rec);
} else if has(f, FLAG_LAST_SEGMENT) {
r2.get_or_insert(rec);
}
}
let tag =
|rec: &RawRecord, span: Option<(usize, usize)>| span.map(|(s, e)| (rec.ref_id(), s, e));
let five = |rec: &RawRecord| tag(rec, five_prime_ref_span(rec, n));
let three = |rec: &RawRecord| tag(rec, three_prime_ref_span(rec, n));
if let Some(rec) = unpaired {
return TemplateTermini { r1: five(rec), r2: three(rec), single: true };
}
match (r1, r2) {
(Some(a), Some(b)) => TemplateTermini { r1: five(a), r2: five(b), single: false },
(Some(m), None) | (None, Some(m)) => {
TemplateTermini { r1: five(m), r2: three(m), single: true }
}
(None, None) => TemplateTermini::default(),
}
}
fn tally_record(&self, rec: &RawRecord, trim: RecordTrim, counters: &mut PerContextCounters) {
let tid = rec.ref_id();
match self.reference.encoding() {
RefEncoding::Bytes => {
if let Some(c) = self.reference.byte_codes(tid) {
self.tally_aligned(rec, c, trim, counters);
}
}
RefEncoding::Nibble => {
if let Some(c) = self.reference.nibble_codes(tid) {
self.tally_aligned(rec, c, trim, counters);
}
}
RefEncoding::TwoBit => {
if let Some(c) = self.reference.twobit_codes(tid) {
self.tally_aligned(rec, c, trim, counters);
}
}
}
}
fn tally_aligned<R: RefCodes + Copy>(
&self,
rec: &RawRecord,
refc: R,
trim: RecordTrim,
counters: &mut PerContextCounters,
) {
let seq_len = rec.l_seq() as usize;
if seq_len == 0 {
return;
}
let ref_len = refc.len();
let f = rec.flags();
let monitor_c = monitor_c_of(f);
let keep_lo = trim.trim_lo;
let keep_hi = seq_len.saturating_sub(trim.trim_hi);
let min_bq = self.opts.min_base_quality;
let monitored_base = if monitor_c { BASE_C } else { BASE_G };
let mut read_pos: usize = 0;
let pos = rec.pos();
if pos < 0 {
return;
}
let mut ref_pos: usize = pos as usize;
let params = SpanParams {
monitor_c,
monitored_base,
min_bq,
keep_lo,
keep_hi,
seq_len,
ref_len,
skip: trim.skip,
};
for op in rec.cigar_ops_iter() {
let len = (op >> 4) as usize;
let code = op & 0xf;
match code {
0 | 7 | 8 => {
tally_span(rec, refc, read_pos, ref_pos, len, ¶ms, counters);
read_pos += len;
ref_pos += len;
}
1 => read_pos += len,
4 => read_pos += len,
2 | 3 => ref_pos += len,
_ => {}
}
}
}
fn decide(&self, counters: &PerContextCounters) -> bool {
let unconv = counters.unconv_in(self.opts.contexts);
let monitored = counters.total_in(self.opts.contexts);
self.classify(unconv, monitored).0
}
pub(crate) fn classify(&self, unconv: u64, monitored: u64) -> (bool, DecidedBy) {
if monitored == 0 {
return (false, DecidedBy::ZeroSites);
}
let count_hit = unconv >= u64::from(self.opts.max_unconverted_count);
let at_floor = monitored >= u64::from(self.opts.min_sites);
let frac_hit =
at_floor && (unconv as f64) / (monitored as f64) > self.opts.max_unconverted_fraction;
match self.opts.mode {
DecisionMode::Count => (count_hit, DecidedBy::Count),
DecisionMode::Proportion => {
if at_floor {
(frac_hit, DecidedBy::Proportion)
} else {
(false, DecidedBy::MinSitesFloor)
}
}
DecisionMode::Either => (count_hit || frac_hit, DecidedBy::Either),
DecisionMode::Adaptive => {
if at_floor {
(frac_hit, DecidedBy::Proportion)
} else {
(count_hit, DecidedBy::Count)
}
}
}
}
fn emit(
&self,
block: &mut [RawRecord],
action: Action,
counts: (u64, u64),
out: &mut RawBamWriter,
) -> Result<()> {
if action == Action::Remove {
return Ok(());
}
let count_value = self.opts.count_tag.map(|_| format!("{}/{}", counts.0, counts.1));
for rec in block.iter_mut() {
if action == Action::Mark {
if self.opts.qc_fail {
rec.set_flags(rec.flags() | FLAG_QC_FAIL);
}
if rec.tags().find_string(&self.opts.tag.tag).is_none() {
rec.tags_editor().append_string(&self.opts.tag.tag, &self.opts.tag.value);
}
}
if let (Some(tag), Some(value)) = (&self.opts.count_tag, &count_value)
&& rec.tags().find_string(tag).is_none()
{
rec.tags_editor().append_string(tag, value.as_bytes());
}
out.write_record(rec)?;
}
Ok(())
}
}
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub(crate) enum DecisionMode {
Count,
Proportion,
Either,
Adaptive,
}
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub(crate) enum DecidedBy {
ZeroSites,
Count,
Proportion,
MinSitesFloor,
Either,
}
impl DecidedBy {
pub(crate) fn as_str(self) -> &'static str {
match self {
DecidedBy::ZeroSites => "zero_sites",
DecidedBy::Count => "count",
DecidedBy::Proportion => "proportion",
DecidedBy::MinSitesFloor => "min_sites_floor",
DecidedBy::Either => "either",
}
}
}
pub(crate) struct ProcessorOptions {
pub(crate) contexts: ContextMask,
pub(crate) mode: DecisionMode,
pub(crate) max_unconverted_count: u32,
pub(crate) max_unconverted_fraction: f64,
pub(crate) min_sites: u32,
pub(crate) min_base_quality: u8,
pub(crate) ignore_template_ends: u32,
pub(crate) ignore_supplementary_evidence: bool,
pub(crate) tag: TagSpec,
pub(crate) count_tag: Option<[u8; 2]>,
pub(crate) qc_fail: bool,
pub(crate) remove_unconverted: bool,
pub(crate) scope_of_tid: Vec<Option<usize>>,
pub(crate) record_matrix: bool,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Action {
PassThrough,
Mark,
Remove,
}
#[derive(Debug, Clone, Copy, Default)]
struct TemplateTermini {
r1: Option<(i32, usize, usize)>,
r2: Option<(i32, usize, usize)>,
single: bool,
}
impl TemplateTermini {
#[inline(never)]
fn own_trim_for(&self, rec: &RawRecord, n: u32) -> (usize, usize) {
let n = n as usize;
if n == 0 {
return (0, 0);
}
if self.single {
(n, n)
} else if has(rec.flags(), FLAG_REVERSE) {
(0, n)
} else {
(n, 0)
}
}
#[inline(never)]
fn mate_skip_for(&self, rec: &RawRecord) -> Option<(usize, usize)> {
if self.single {
return None;
}
let f = rec.flags();
let mate = if has(f, FLAG_FIRST_SEGMENT) {
self.r2
} else if has(f, FLAG_LAST_SEGMENT) {
self.r1
} else {
return None;
};
let (ttid, s, e) = mate?;
if ttid != rec.ref_id() {
return None;
}
let rstart = rec.pos().max(0) as usize;
let rend = rstart + rec.reference_length().max(0) as usize;
(s < rend && rstart < e).then_some((s, e))
}
}
#[derive(Debug, Clone, Copy, Default)]
struct RecordTrim {
trim_lo: usize,
trim_hi: usize,
skip: Option<(usize, usize)>,
}
#[derive(Debug, Clone)]
pub(crate) struct Stats {
pub(crate) genome: ScopeStats,
pub(crate) controls: Vec<ScopeStats>,
pub(crate) chimeric_to_control_templates: u64,
pub(crate) unmapped_templates: u64,
pub(crate) zero_site_templates: u64,
pub(crate) below_min_sites_templates: u64,
pub(crate) total_templates: u64,
pub(crate) conversion_matrix: BTreeMap<(u64, u64), u64>,
}
impl Stats {
#[must_use]
pub(crate) fn new(control_names: &[String]) -> Self {
Self {
genome: ScopeStats::new("genome".to_string()),
controls: control_names.iter().map(|n| ScopeStats::new(n.clone())).collect(),
chimeric_to_control_templates: 0,
unmapped_templates: 0,
zero_site_templates: 0,
below_min_sites_templates: 0,
total_templates: 0,
conversion_matrix: BTreeMap::new(),
}
}
}
#[derive(Debug, Clone)]
pub(crate) struct ScopeStats {
pub(crate) name: String,
pub(crate) n_templates: u64,
pub(crate) n_evaluated: u64,
pub(crate) n_unconverted: u64,
pub(crate) n_removed: u64,
pub(crate) counters: PerContextCounters,
}
impl ScopeStats {
fn new(name: String) -> Self {
Self {
name,
n_templates: 0,
n_evaluated: 0,
n_unconverted: 0,
n_removed: 0,
counters: PerContextCounters::default(),
}
}
}
#[derive(Debug, Default, Clone, Copy)]
pub(crate) struct PerContextCounters {
pub(crate) unconv: [u64; 4],
pub(crate) total: [u64; 4],
}
impl PerContextCounters {
#[inline]
pub(crate) fn record(&mut self, ctx: Context, unconverted: bool) {
let i = ctx.index();
self.total[i] += 1;
if unconverted {
self.unconv[i] += 1;
}
}
pub(crate) fn add(&mut self, other: &PerContextCounters) {
for i in 0..4 {
self.unconv[i] += other.unconv[i];
self.total[i] += other.total[i];
}
}
#[must_use]
pub(crate) fn unconv_in(&self, mask: ContextMask) -> u64 {
Context::ALL.iter().filter(|c| mask.contains(**c)).map(|c| self.unconv[c.index()]).sum()
}
#[must_use]
pub(crate) fn total_in(&self, mask: ContextMask) -> u64 {
Context::ALL.iter().filter(|c| mask.contains(**c)).map(|c| self.total[c.index()]).sum()
}
#[must_use]
pub(crate) fn monitored_total(&self) -> u64 {
self.total.iter().sum()
}
}
#[derive(Debug, Clone, Copy)]
pub(crate) struct SpanParams {
pub(crate) monitor_c: bool,
pub(crate) monitored_base: u8,
pub(crate) min_bq: u8,
pub(crate) keep_lo: usize,
pub(crate) keep_hi: usize,
pub(crate) seq_len: usize,
pub(crate) ref_len: usize,
pub(crate) skip: Option<(usize, usize)>,
}
#[inline]
fn tally_site<R: RefCodes>(
rec: &RawRecord,
refc: R,
rp: usize,
gp: usize,
p: &SpanParams,
counters: &mut PerContextCounters,
) {
if rec.get_qual(rp) < p.min_bq {
return;
}
let ctx = if p.monitor_c {
if gp + 1 >= p.ref_len {
return;
}
refc.ctx_top(gp + 1)
} else {
if gp == 0 {
return;
}
refc.ctx_bottom(gp - 1)
};
let Some(ctx) = ctx else { return };
let readb = rec.get_base(rp);
let unconverted = if p.monitor_c {
match readb {
BASE_C => true, BASE_T => false, _ => return, }
} else {
match readb {
BASE_G => true,
BASE_A => false,
_ => return,
}
};
counters.record(ctx, unconverted);
}
#[inline]
#[allow(clippy::too_many_arguments)]
fn tally_run<R: RefCodes + Copy>(
rec: &RawRecord,
refc: R,
rp_start: usize,
gp_start: usize,
lo: usize,
hi: usize,
p: &SpanParams,
counters: &mut PerContextCounters,
) {
for k in lo..hi {
let gp = gp_start + k;
if !refc.monitors(gp, p.monitored_base) {
continue;
}
tally_site(rec, refc, rp_start + k, gp, p, counters);
}
}
pub(crate) fn tally_span<R: RefCodes + Copy>(
rec: &RawRecord,
refc: R,
rp_start: usize,
gp_start: usize,
len: usize,
p: &SpanParams,
counters: &mut PerContextCounters,
) {
let k0 = p.keep_lo.saturating_sub(rp_start);
let k1 = len
.min(p.keep_hi.saturating_sub(rp_start))
.min(p.seq_len.saturating_sub(rp_start))
.min(p.ref_len.saturating_sub(gp_start));
match p.skip {
None => tally_run(rec, refc, rp_start, gp_start, k0, k1, p, counters),
Some((s, e)) => {
let sk0 = s.saturating_sub(gp_start).clamp(k0, k1);
let sk1 = e.saturating_sub(gp_start).clamp(k0, k1);
tally_run(rec, refc, rp_start, gp_start, k0, sk0, p, counters);
tally_run(rec, refc, rp_start, gp_start, sk1, k1, p, counters);
}
}
}
#[inline]
fn has(flags: u16, bit: u16) -> bool {
flags & bit != 0
}
#[inline]
fn monitor_c_of(flags: u16) -> bool {
let treat_as_read1 = has(flags, FLAG_FIRST_SEGMENT) || !has(flags, FLAG_PAIRED);
treat_as_read1 ^ has(flags, FLAG_REVERSE)
}
fn ref_span_for_query_window(rec: &RawRecord, q_lo: usize, q_hi: usize) -> Option<(usize, usize)> {
if q_lo >= q_hi {
return None;
}
let mut qpos = 0usize;
let mut rpos = rec.pos().max(0) as usize;
let mut lo = usize::MAX;
let mut hi = 0usize;
for op in rec.cigar_ops_iter() {
let len = (op >> 4) as usize;
match op & 0xf {
0 | 7 | 8 => {
let a = qpos.max(q_lo);
let b = (qpos + len).min(q_hi);
if a < b {
lo = lo.min(rpos + (a - qpos));
hi = hi.max(rpos + (b - qpos));
}
qpos += len;
rpos += len;
}
1 | 4 => qpos += len,
2 | 3 => rpos += len,
_ => {}
}
if qpos >= q_hi {
break;
}
}
(lo < hi).then_some((lo, hi))
}
fn five_prime_ref_span(rec: &RawRecord, n: usize) -> Option<(usize, usize)> {
if n == 0 {
return None;
}
let seq_len = rec.l_seq() as usize;
let (q_lo, q_hi) =
if has(rec.flags(), FLAG_REVERSE) { (seq_len.saturating_sub(n), seq_len) } else { (0, n) };
ref_span_for_query_window(rec, q_lo, q_hi)
}
fn three_prime_ref_span(rec: &RawRecord, n: usize) -> Option<(usize, usize)> {
if n == 0 {
return None;
}
let seq_len = rec.l_seq() as usize;
let (q_lo, q_hi) =
if has(rec.flags(), FLAG_REVERSE) { (0, n) } else { (seq_len.saturating_sub(n), seq_len) };
ref_span_for_query_window(rec, q_lo, q_hi)
}
#[cfg(test)]
mod tests {
use std::io::{BufRead, Cursor};
use super::*;
use crate::reference::{Reference, encode_ref_base};
use crate::sam_reader::SamReader;
use crate::{TagSpec, parse_contexts};
fn enc(seq: &str) -> Vec<u8> {
seq.bytes().map(encode_ref_base).collect()
}
fn cph_mask() -> ContextMask {
parse_contexts("CpA,CpC,CpT").unwrap()
}
fn opts(contexts: ContextMask, count: u32) -> ProcessorOptions {
ProcessorOptions {
contexts,
mode: DecisionMode::Either,
max_unconverted_count: count,
max_unconverted_fraction: 1.0,
min_sites: 5,
min_base_quality: 0,
ignore_template_ends: 0,
ignore_supplementary_evidence: false,
tag: TagSpec { tag: [b'X', b'X'], value: b"UC".to_vec() },
count_tag: None,
qc_fail: true,
remove_unconverted: false,
scope_of_tid: vec![None],
record_matrix: false,
}
}
fn cph_counters(unconv: u64, total: u64) -> PerContextCounters {
let mut c = PerContextCounters::default();
for i in 0..total {
c.record(Context::CpA, i < unconv);
}
c
}
fn decider(mode: DecisionMode, count: u32, fraction: f64, min_sites: u32) -> RecordProcessor {
let mut o = opts(cph_mask(), count);
o.mode = mode;
o.max_unconverted_fraction = fraction;
o.min_sites = min_sites;
RecordProcessor::new(Reference::from_encoded_contigs(vec![enc("C")]), o)
}
#[test]
fn adaptive_below_floor_uses_count() {
let d = decider(DecisionMode::Adaptive, 3, 0.05, 40);
assert!(d.decide(&cph_counters(3, 4)));
assert!(!d.decide(&cph_counters(2, 4)), "2 < count threshold 3 → not flagged");
}
#[test]
fn adaptive_above_floor_uses_proportion_not_count() {
let d = decider(DecisionMode::Adaptive, 3, 0.05, 40);
assert!(!d.decide(&cph_counters(4, 100)));
assert!(d.decide(&cph_counters(10, 100)), "10% > 5% → flagged");
}
#[test]
fn proportion_below_floor_never_flags() {
let d = decider(DecisionMode::Proportion, 3, 0.05, 40);
assert!(!d.decide(&cph_counters(4, 4)));
assert!(d.decide(&cph_counters(3, 40)), "3/40 = 7.5% > 5% at the floor → flagged");
}
#[test]
fn count_mode_ignores_fraction_and_floor() {
let d = decider(DecisionMode::Count, 3, 0.05, 40);
assert!(d.decide(&cph_counters(4, 100)));
assert!(d.decide(&cph_counters(3, 4)));
}
#[test]
fn either_fires_when_only_one_test_hits() {
let either = decider(DecisionMode::Either, 10, 0.05, 40);
assert!(either.decide(&cph_counters(8, 100)));
assert!(!decider(DecisionMode::Count, 10, 0.05, 40).decide(&cph_counters(8, 100)));
assert!(either.decide(&cph_counters(10, 12)));
}
#[test]
fn adaptive_switch_is_continuous_at_the_default_floor() {
let d = decider(DecisionMode::Adaptive, 3, 0.05, 40);
assert!(d.decide(&cph_counters(3, 39)) && d.decide(&cph_counters(3, 40)));
assert!(!d.decide(&cph_counters(2, 39)) && !d.decide(&cph_counters(2, 40)));
}
#[test]
fn classify_reports_decided_by_and_matches_decide() {
let d = decider(DecisionMode::Adaptive, 3, 0.05, 40);
assert_eq!(d.classify(0, 0), (false, DecidedBy::ZeroSites));
let c = decider(DecisionMode::Count, 3, 0.05, 40);
assert_eq!(c.classify(3, 10), (true, DecidedBy::Count));
assert_eq!(c.classify(2, 10), (false, DecidedBy::Count));
let p = decider(DecisionMode::Proportion, 3, 0.05, 40);
assert_eq!(p.classify(5, 10), (false, DecidedBy::MinSitesFloor));
assert_eq!(p.classify(5, 50), (true, DecidedBy::Proportion));
assert_eq!(d.classify(3, 10), (true, DecidedBy::Count));
assert_eq!(d.classify(1, 50), (false, DecidedBy::Proportion));
let e = decider(DecisionMode::Either, 3, 0.05, 40);
assert_eq!(e.classify(3, 10), (true, DecidedBy::Either));
assert_eq!(e.classify(2, 10), (false, DecidedBy::Either));
for &(u, t) in &[(0, 0), (3, 10), (2, 10), (5, 50), (1, 50), (40, 100)] {
assert_eq!(d.classify(u, t).0, d.decide(&cph_counters(u, t)), "u={u} t={t}");
}
}
fn parse_sam_records(records: &[&str], contig_len: usize) -> Vec<RawRecord> {
let mut sam = format!("@HD\tVN:1.6\tSO:unsorted\n@SQ\tSN:chr1\tLN:{contig_len}\n");
for r in records {
sam.push_str(r);
sam.push('\n');
}
let boxed: Box<dyn BufRead> = Box::new(Cursor::new(sam.into_bytes()));
let mut reader = SamReader::new(boxed);
reader.read_header().expect("read header");
let mut out = Vec::new();
loop {
let mut rec = RawRecord::new();
if !reader.read_record(&mut rec).expect("read record") {
break;
}
out.push(rec);
}
out
}
fn sam_line(flag: u16, pos: u32, cigar: &str, seq: &str, qual: &str) -> String {
format!("q\t{flag}\tchr1\t{pos}\t60\t{cigar}\t*\t0\t0\t{seq}\t{qual}")
}
#[test]
fn forward_read_top_strand_counts_unconverted_cph() {
let reference = Reference::from_encoded_contigs(vec![enc("CACACACACA")]);
let proc = RecordProcessor::new(reference, opts(cph_mask(), 3));
let recs = parse_sam_records(&[&sam_line(0, 1, "10M", "CACACACACA", "IIIIIIIIII")], 10);
let mut counters = PerContextCounters::default();
proc.tally_record(&recs[0], RecordTrim::default(), &mut counters);
assert_eq!(counters.unconv[Context::CpA.index()], 5);
assert_eq!(counters.total[Context::CpA.index()], 5);
assert!(proc.decide(&counters), "5 unconverted CpA ≥ 3 → unconverted");
}
#[test]
fn converted_read_is_not_flagged() {
let reference = Reference::from_encoded_contigs(vec![enc("CACACACACA")]);
let proc = RecordProcessor::new(reference, opts(cph_mask(), 3));
let recs = parse_sam_records(&[&sam_line(0, 1, "10M", "TATATATATA", "IIIIIIIIII")], 10);
let mut counters = PerContextCounters::default();
proc.tally_record(&recs[0], RecordTrim::default(), &mut counters);
assert_eq!(counters.unconv[Context::CpA.index()], 0);
assert_eq!(counters.total[Context::CpA.index()], 5);
assert!(!proc.decide(&counters));
}
#[test]
fn reverse_single_end_read_monitors_ref_g() {
let reference = Reference::from_encoded_contigs(vec![enc("TGTGTGTGTG")]);
let proc = RecordProcessor::new(reference, opts(cph_mask(), 3));
let recs =
parse_sam_records(&[&sam_line(FLAG_REVERSE, 1, "10M", "TGTGTGTGTG", "IIIIIIIIII")], 10);
let mut counters = PerContextCounters::default();
proc.tally_record(&recs[0], RecordTrim::default(), &mut counters);
assert_eq!(counters.unconv[Context::CpA.index()], 5);
assert_eq!(counters.total[Context::CpA.index()], 5);
}
#[test]
fn five_prime_span_forward_and_reverse() {
let q = "I".repeat(10);
let fwd = parse_sam_records(&[&sam_line(0, 1, "10M", "CACACACACA", &q)], 30);
assert_eq!(five_prime_ref_span(&fwd[0], 3), Some((0, 3)));
assert_eq!(three_prime_ref_span(&fwd[0], 3), Some((7, 10)));
let rev = parse_sam_records(&[&sam_line(FLAG_REVERSE, 1, "10M", "CACACACACA", &q)], 30);
assert_eq!(five_prime_ref_span(&rev[0], 3), Some((7, 10)));
assert_eq!(three_prime_ref_span(&rev[0], 3), Some((0, 3)));
}
#[test]
fn five_prime_span_skips_leading_soft_clip() {
let q = "I".repeat(10);
let recs = parse_sam_records(&[&sam_line(0, 1, "3S7M", "GGGCACACAC", &q)], 30);
assert_eq!(five_prime_ref_span(&recs[0], 3), None);
assert_eq!(five_prime_ref_span(&recs[0], 5), Some((0, 2)));
}
#[test]
fn ref_span_stretches_across_deletion() {
let q = "I".repeat(6);
let recs = parse_sam_records(&[&sam_line(0, 1, "2M3D4M", "CACGTA", &q)], 30);
assert_eq!(five_prime_ref_span(&recs[0], 4), Some((0, 7)));
}
}