mod io_threading;
mod raw_reader;
mod raw_writer;
mod reference;
mod sam_reader;
mod sieve;
mod stats;
use std::collections::HashMap;
use std::fs::File;
use std::io::{BufRead, Write};
use std::path::PathBuf;
use std::process::ExitCode;
use std::time::Instant;
use anyhow::{Context as _, Result, bail};
use bgzf::CompressionLevel;
use clap::{Parser, ValueEnum};
use fgumi_raw_bam::RawRecord;
use noodles_sam::Header;
use noodles_sam::header::record::value::Map;
use noodles_sam::header::record::value::map::Program;
use noodles_sam::header::record::value::map::program::tag as program_tag;
use crate::raw_reader::RawBamReader;
use crate::raw_writer::RawBamWriter;
use crate::reference::{Context, RefEncoding, Reference};
use crate::sam_reader::SamReader;
use crate::sieve::{DecisionMode, ProcessorOptions, RecordProcessor, Stats};
const METHYLSIEVE_BUILD: &str = env!("CARGO_PKG_VERSION");
#[global_allocator]
static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc;
#[derive(Copy, Clone, Debug, PartialEq, Eq, ValueEnum)]
pub(crate) enum RefEncodingCli {
#[value(name = "bytes")]
Bytes,
#[value(name = "nibble")]
Nibble,
#[value(name = "twobit")]
TwoBit,
}
impl From<RefEncodingCli> for RefEncoding {
fn from(c: RefEncodingCli) -> Self {
match c {
RefEncodingCli::Bytes => RefEncoding::Bytes,
RefEncodingCli::Nibble => RefEncoding::Nibble,
RefEncodingCli::TwoBit => RefEncoding::TwoBit,
}
}
}
#[derive(Copy, Clone, Debug, PartialEq, Eq, ValueEnum)]
pub(crate) enum DecisionModeCli {
#[value(name = "count")]
Count,
#[value(name = "proportion")]
Proportion,
#[value(name = "either")]
Either,
#[value(name = "adaptive")]
Adaptive,
}
impl From<DecisionModeCli> for DecisionMode {
fn from(c: DecisionModeCli) -> Self {
match c {
DecisionModeCli::Count => DecisionMode::Count,
DecisionModeCli::Proportion => DecisionMode::Proportion,
DecisionModeCli::Either => DecisionMode::Either,
DecisionModeCli::Adaptive => DecisionMode::Adaptive,
}
}
}
fn parse_compression_level(s: &str) -> Result<CompressionLevel, String> {
let n: u8 = s.parse().map_err(|e| format!("not a u8: {e}"))?;
CompressionLevel::new(n).map_err(|e| format!("{e}"))
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) struct ContextMask([bool; 4]);
impl ContextMask {
#[inline]
#[must_use]
pub(crate) fn contains(self, ctx: Context) -> bool {
self.0[ctx.index()]
}
}
pub(crate) fn parse_contexts(s: &str) -> Result<ContextMask, String> {
let mut mask = [false; 4];
let mut any = false;
for tok in s.split(',') {
let t = tok.trim().to_ascii_uppercase();
let ctx = match t.as_str() {
"CPA" | "CA" => Context::CpA,
"CPC" | "CC" => Context::CpC,
"CPG" | "CG" => Context::CpG,
"CPT" | "CT" => Context::CpT,
"" => continue,
other => {
return Err(format!(
"unknown context '{other}'; valid contexts are CpA, CpC, CpG, CpT"
));
}
};
mask[ctx.index()] = true;
any = true;
}
if !any {
return Err("at least one context must be specified".to_string());
}
Ok(ContextMask(mask))
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) struct TagSpec {
pub(crate) tag: [u8; 2],
pub(crate) value: Vec<u8>,
}
fn parse_tag_name(s: &str) -> Result<[u8; 2], String> {
let b = s.as_bytes();
if b.len() != 2 || !b.iter().all(u8::is_ascii_alphanumeric) {
return Err(format!("tag name '{s}' must be exactly two alphanumeric characters"));
}
Ok([b[0], b[1]])
}
fn parse_tag_spec(s: &str) -> Result<TagSpec, String> {
let parts: Vec<&str> = s.splitn(3, ':').collect();
if parts.len() != 3 {
return Err(format!("tag spec '{s}' must be of the form TAG:Z:VALUE (e.g. XX:Z:UC)"));
}
let tag_bytes = parts[0].as_bytes();
if tag_bytes.len() != 2 || !tag_bytes.iter().all(u8::is_ascii_alphanumeric) {
return Err(format!("tag '{}' must be exactly two alphanumeric characters", parts[0]));
}
if !parts[1].eq_ignore_ascii_case("Z") {
return Err(format!(
"only string (Z) tags are supported; got type '{}' in '{s}'",
parts[1]
));
}
if parts[2].is_empty() {
return Err(format!("tag spec '{s}' has an empty value"));
}
Ok(TagSpec { tag: [tag_bytes[0], tag_bytes[1]], value: parts[2].as_bytes().to_vec() })
}
#[derive(Parser, Debug, Clone)]
#[command(name = "methylsieve", version = METHYLSIEVE_BUILD, about = SHORT_ABOUT, long_about = LONG_ABOUT)]
pub struct Args {
#[arg(short = 'i', long = "input", help_heading = "Inputs / outputs")]
pub(crate) input: Option<PathBuf>,
#[arg(short = 'o', long = "output", help_heading = "Inputs / outputs")]
pub(crate) output: Option<PathBuf>,
#[arg(long = "compression-level", default_value = "0", value_parser = parse_compression_level,
help_heading = "Inputs / outputs")]
pub(crate) compression_level: CompressionLevel,
#[arg(short = 'r', long = "reference", help_heading = "Inputs / outputs")]
pub(crate) reference: PathBuf,
#[arg(long = "ref-encoding", value_enum, default_value_t = RefEncodingCli::TwoBit,
help_heading = "Inputs / outputs")]
pub(crate) ref_encoding: RefEncodingCli,
#[arg(long = "contexts", default_value = "CpA,CpC,CpT", value_parser = parse_contexts,
help_heading = "Decision")]
pub(crate) contexts: ContextMask,
#[arg(long = "mode", value_enum, default_value_t = DecisionModeCli::Adaptive,
help_heading = "Decision")]
pub(crate) mode: DecisionModeCli,
#[arg(long = "max-unconverted-count", default_value_t = 3, help_heading = "Decision")]
pub(crate) max_unconverted_count: u32,
#[arg(long = "max-unconverted-fraction", default_value_t = 0.05, help_heading = "Decision")]
pub(crate) max_unconverted_fraction: f64,
#[arg(long = "min-sites", default_value_t = 40, help_heading = "Decision")]
pub(crate) min_sites: u32,
#[arg(long = "min-base-quality", default_value_t = 20, help_heading = "Decision")]
pub(crate) min_base_quality: u8,
#[arg(long = "ignore-template-ends", default_value_t = 0, help_heading = "Decision")]
pub(crate) ignore_template_ends: u32,
#[arg(long = "ignore-supplementary-evidence", help_heading = "Decision")]
pub(crate) ignore_supplementary_evidence: bool,
#[arg(long = "tag", default_value = "XX:Z:UC", value_parser = parse_tag_spec,
help_heading = "Actions on unconverted templates")]
pub(crate) tag: TagSpec,
#[arg(long = "no-qc-fail", help_heading = "Actions on unconverted templates")]
pub(crate) no_qc_fail: bool,
#[arg(long = "remove-unconverted", help_heading = "Actions on unconverted templates")]
pub(crate) remove_unconverted: bool,
#[arg(long = "control-contig", help_heading = "Spike-in controls")]
pub(crate) control_contig: Vec<String>,
#[arg(long = "stats", help_heading = "Stats & misc")]
pub(crate) stats: Option<PathBuf>,
#[arg(long = "conversion-matrix", help_heading = "Stats & misc")]
pub(crate) conversion_matrix: Option<PathBuf>,
#[arg(long = "sample", help_heading = "Stats & misc")]
pub(crate) sample: Option<String>,
#[arg(long = "count-tag", value_parser = parse_tag_name, default_value = "ch",
conflicts_with = "no_count_tag", help_heading = "Stats & misc")]
pub(crate) count_tag: [u8; 2],
#[arg(long = "no-count-tag", help_heading = "Stats & misc")]
pub(crate) no_count_tag: bool,
#[arg(short = 'q', long = "quiet", help_heading = "Stats & misc")]
pub(crate) quiet: bool,
#[arg(long = "check-crc", conflicts_with = "no_check_crc", help_heading = "Stats & misc")]
pub(crate) check_crc: bool,
#[arg(long = "no-check-crc", conflicts_with = "check_crc", help_heading = "Stats & misc")]
pub(crate) no_check_crc: bool,
#[arg(long = "read-buffer-mb", default_value_t = 16, value_parser = clap::value_parser!(u32).range(1..=4096),
help_heading = "Stats & misc")]
pub(crate) read_buffer_mb: u32,
#[arg(long = "write-buffer-mb", default_value_t = 64, value_parser = clap::value_parser!(u32).range(1..=4096),
help_heading = "Stats & misc")]
pub(crate) write_buffer_mb: u32,
}
const SHORT_ABOUT: &str = "Tag or filter unconverted reads in bisulfite / EM-seq SAM/BAM files.";
const LONG_ABOUT: &str = "Tag or filter incompletely-converted reads in directional bisulfite or \
EM-seq data.\n\
Makes one per-template decision using all of a QNAME's primary and supplementary records, \
propagates it to every record, and emits a per-context / per-spike-in conversion-rate TSV. \
Input must be query-grouped; output is always BAM.";
impl Args {
#[must_use]
pub(crate) fn effective_check_crc(&self) -> bool {
if self.check_crc {
return true;
}
if self.no_check_crc {
return false;
}
matches!(self.input.as_deref(), Some(p) if p.to_string_lossy() != "-")
}
#[must_use]
pub(crate) fn qc_fail(&self) -> bool {
!self.no_qc_fail
}
#[must_use]
pub(crate) fn effective_count_tag(&self) -> Option<[u8; 2]> {
if self.no_count_tag { None } else { Some(self.count_tag) }
}
pub(crate) fn validate(&self) -> Result<()> {
if let Some(p) = &self.output {
let s = p.to_string_lossy();
if s != "-" && !s.ends_with(".bam") {
bail!(
"output path {} must end in `.bam` (methylsieve only writes BAM); \
use `-` to send BAM to stdout",
p.display()
);
}
}
let mut stdout_streams: Vec<&str> = Vec::new();
let bam_to_stdout = match self.output.as_deref() {
None => true,
Some(p) => p.to_string_lossy() == "-",
};
if bam_to_stdout {
stdout_streams.push("the BAM output");
}
if matches!(self.stats.as_deref(), Some(p) if p.to_string_lossy() == "-") {
stdout_streams.push("--stats");
}
if matches!(self.conversion_matrix.as_deref(), Some(p) if p.to_string_lossy() == "-") {
stdout_streams.push("--conversion-matrix");
}
if stdout_streams.len() > 1 {
bail!(
"{} are all directed to stdout; at most one stream may use stdout, or \
they would interleave. The BAM goes to stdout by default — send it to a \
file with `-o out.bam`, or write --stats / --conversion-matrix to a path.",
stdout_streams.join(" and ")
);
}
if !(self.max_unconverted_fraction > 0.0 && self.max_unconverted_fraction <= 1.0) {
bail!(
"--max-unconverted-fraction must be in (0, 1]; got {}",
self.max_unconverted_fraction
);
}
if self.max_unconverted_count == 0 {
bail!(
"--max-unconverted-count must be >= 1 (a threshold of 0 would tag every \
template). Use --max-unconverted-fraction for fraction-based filtering."
);
}
Ok(())
}
}
fn main() -> ExitCode {
env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info"))
.format_timestamp(None)
.init();
let args = Args::parse();
match run(args) {
Ok(()) => ExitCode::SUCCESS,
Err(err) => {
eprintln!("methylsieve: {err:#}");
eprintln!("methylsieve: Premature exit (return code 1).");
ExitCode::FAILURE
}
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Format {
Sam,
Bam,
}
fn detect_format<R: BufRead>(reader: &mut R) -> Result<Format> {
let head = reader.fill_buf().context("reading input to detect format")?;
if head.is_empty() {
bail!("Empty input: no SAM/BAM header detected");
}
match head[0] {
0x1f => Ok(Format::Bam),
b'@' => Ok(Format::Sam),
b => bail!(
"Input doesn't look like SAM or BAM (first byte 0x{:02x}); expected '@' or 0x1f",
b
),
}
}
enum Reader {
Bam(RawBamReader<Box<dyn BufRead>>),
Sam(SamReader<Box<dyn BufRead>>),
}
impl Reader {
fn read_header(&mut self) -> Result<Header> {
match self {
Reader::Bam(r) => r.read_header().context("reading BAM header"),
Reader::Sam(r) => r.read_header().context("reading SAM header"),
}
}
fn read_record(&mut self, rec: &mut RawRecord) -> std::io::Result<bool> {
match self {
Reader::Bam(r) => r.read_record(rec),
Reader::Sam(r) => r.read_record(rec),
}
}
}
fn run(args: Args) -> Result<()> {
log::info!("methylsieve by Fulcrum Genomics - https://github.com/fulcrumgenomics/methylsieve");
args.validate()?;
let started = StartedRun::now();
if !args.quiet {
eprintln!("methylsieve: Version {METHYLSIEVE_BUILD}");
}
let raw_source: Box<dyn std::io::Read + Send> = match args.input.as_deref() {
Some(p) if p.to_string_lossy() != "-" => {
let f = File::open(p).with_context(|| format!("opening {} for read", p.display()))?;
Box::new(f)
}
_ => Box::new(std::io::stdin()),
};
let read_buf_bytes = (args.read_buffer_mb as usize).saturating_mul(1024 * 1024);
let mut reader_box: Box<dyn BufRead> =
Box::new(crate::io_threading::ThreadedReader::new(raw_source, read_buf_bytes));
let input_name =
args.input.as_deref().map(|p| p.display().to_string()).unwrap_or_else(|| "stdin".into());
let input_format = detect_format(&mut reader_box)?;
let check_crc = args.effective_check_crc();
let mut reader: Reader = match input_format {
Format::Bam => {
if !args.quiet {
eprintln!(
"methylsieve: Reading BAM from {input_name} (CRC verify: {}).",
if check_crc { "on" } else { "off" }
);
}
Reader::Bam(RawBamReader::new(reader_box, check_crc))
}
Format::Sam => {
if !args.quiet {
eprintln!("methylsieve: Reading SAM from {input_name}.");
}
Reader::Sam(SamReader::new(reader_box))
}
};
let mut header = reader.read_header()?;
if header.reference_sequences().is_empty() {
bail!("Input has no @SQ reference sequences. Exiting.");
}
if !args.quiet {
eprintln!(
"methylsieve: Loaded {} header sequence entries.",
header.reference_sequences().len()
);
}
let reference = Reference::load(&args.reference, &header, args.ref_encoding.into())
.with_context(|| format!("loading reference {}", args.reference.display()))?;
if !args.quiet {
eprintln!("methylsieve: Loaded reference {}.", args.reference.display());
}
let (scope_of_tid, control_names) = resolve_control_scopes(&header, &args.control_contig)?;
append_methylsieve_pg(&mut header)?;
let write_buf_bytes = (args.write_buffer_mb as usize).saturating_mul(1024 * 1024);
let mut out = RawBamWriter::open(
args.output.as_deref(),
&header,
write_buf_bytes,
args.compression_level,
)?;
let opts = ProcessorOptions {
contexts: args.contexts,
mode: args.mode.into(),
max_unconverted_count: args.max_unconverted_count,
max_unconverted_fraction: args.max_unconverted_fraction,
min_sites: args.min_sites,
min_base_quality: args.min_base_quality,
ignore_template_ends: args.ignore_template_ends,
ignore_supplementary_evidence: args.ignore_supplementary_evidence,
tag: args.tag.clone(),
count_tag: args.effective_count_tag(),
qc_fail: args.qc_fail(),
remove_unconverted: args.remove_unconverted,
scope_of_tid,
record_matrix: args.conversion_matrix.is_some(),
};
let processor = RecordProcessor::new(reference, opts);
let mut stats = Stats::new(&control_names);
let mut pool: Vec<RawRecord> = Vec::with_capacity(8);
for_each_block(&mut reader, &mut pool, |block| {
processor.process_block(block, &mut stats, &mut out)
})
.context("processing record block")?;
print_run_stats(&stats, &args);
warn_proportion_blind_spot(&stats, &args);
if let Some(stats_path) = args.stats.as_deref() {
crate::stats::write_to_path(stats_path, &stats, &header, args.sample.as_deref())
.context("writing --stats TSV")?;
}
if let Some(matrix_path) = args.conversion_matrix.as_deref() {
crate::stats::write_matrix_to_path(
matrix_path,
&stats,
&header,
args.sample.as_deref(),
|unconv, monitored| processor.classify(unconv, monitored),
)
.context("writing --conversion-matrix TSV")?;
}
out.finish().context("finishing main output")?;
report(&started, stats.total_templates, args.quiet);
Ok(())
}
fn resolve_control_scopes(
header: &Header,
control_contigs: &[String],
) -> Result<(Vec<Option<usize>>, Vec<String>)> {
let n = header.reference_sequences().len();
let name_to_tid: HashMap<String, usize> = header
.reference_sequences()
.keys()
.enumerate()
.map(|(tid, name)| (String::from_utf8_lossy(name.as_ref()).into_owned(), tid))
.collect();
let mut scope_of_tid = vec![None; n];
let mut control_names = Vec::with_capacity(control_contigs.len());
for (ci, name) in control_contigs.iter().enumerate() {
let tid = *name_to_tid.get(name).ok_or_else(|| {
anyhow::anyhow!(
"--control-contig '{name}' is not present in the input @SQ header lines."
)
})?;
scope_of_tid[tid] = Some(ci);
control_names.push(name.clone());
}
Ok((scope_of_tid, control_names))
}
fn for_each_block(
reader: &mut Reader,
pool: &mut Vec<RawRecord>,
mut on_block: impl FnMut(&mut [RawRecord]) -> Result<()>,
) -> Result<()> {
if pool.is_empty() {
pool.push(RawRecord::new());
}
let mut block_len: usize = 0;
let mut current_qname: Vec<u8> = Vec::new();
loop {
if pool.len() == block_len {
pool.push(RawRecord::new());
}
let read_idx = block_len;
let got = reader.read_record(&mut pool[read_idx]).context("reading record")?;
if !got {
break;
}
let new_qname_differs =
block_len > 0 && pool[read_idx].read_name() != current_qname.as_slice();
if new_qname_differs {
on_block(&mut pool[..block_len])?;
if read_idx != 0 {
pool.swap(0, read_idx);
}
block_len = 1;
current_qname.clear();
current_qname.extend_from_slice(pool[0].read_name());
} else {
if block_len == 0 {
current_qname.clear();
current_qname.extend_from_slice(pool[0].read_name());
}
block_len += 1;
}
}
if block_len > 0 {
on_block(&mut pool[..block_len])?;
}
Ok(())
}
fn append_methylsieve_pg(header: &mut Header) -> Result<()> {
let programs = header.programs().as_ref();
let known_ids: std::collections::HashSet<&[u8]> =
programs.keys().map(|k| k.as_slice()).collect();
for (id, map) in programs.iter() {
if let Some(pp) = map.other_fields().get(&program_tag::PREVIOUS_PROGRAM_ID) {
let pp_bytes = pp.as_ref();
if !known_ids.contains(pp_bytes) {
bail!(
"input header @PG ID:{} has PP:{} but no @PG with that ID exists. \
Strip the broken PP tag (e.g. via `samtools reheader`) before re-running.",
String::from_utf8_lossy(id.as_slice()),
String::from_utf8_lossy(pp_bytes),
);
}
}
}
let cl = command_line_for_pg();
let mut map = Map::<Program>::default();
map.other_fields_mut().insert(program_tag::NAME, "methylsieve".into());
map.other_fields_mut().insert(program_tag::VERSION, METHYLSIEVE_BUILD.into());
map.other_fields_mut().insert(program_tag::COMMAND_LINE, cl.into());
header.programs_mut().add("methylsieve", map).context("appending @PG methylsieve record")?;
Ok(())
}
fn command_line_for_pg() -> String {
let mut args = std::env::args();
let prog = args
.next()
.map(|a| {
std::path::Path::new(&a)
.file_name()
.map(|s| s.to_string_lossy().into_owned())
.unwrap_or(a)
})
.unwrap_or_else(|| "methylsieve".to_string());
let rest: Vec<String> = args.collect();
if rest.is_empty() { prog } else { format!("{prog} {}", rest.join(" ")) }
}
fn print_run_stats(stats: &Stats, args: &Args) {
if stats.total_templates == 0 {
eprintln!("methylsieve: No reads processed.");
return;
}
let g = &stats.genome;
let verb = if args.remove_unconverted { "Removed" } else { "Tagged " };
let pct =
if g.n_evaluated > 0 { 100.0 * g.n_unconverted as f64 / g.n_evaluated as f64 } else { 0.0 };
eprintln!(
"methylsieve: {} {:>10} of {:>10} ({:5.3}%) evaluated genome templates as unconverted.",
verb, g.n_unconverted, g.n_evaluated, pct
);
if stats.unmapped_templates > 0 || stats.zero_site_templates > 0 {
eprintln!(
"methylsieve: {} templates had no mapped primary; {} produced no monitored sites.",
stats.unmapped_templates, stats.zero_site_templates
);
}
}
fn warn_proportion_blind_spot(stats: &Stats, args: &Args) {
if args.quiet || args.mode != DecisionModeCli::Proportion {
return;
}
if stats.below_min_sites_templates > 0 {
eprintln!(
"methylsieve: WARNING proportion mode — {} templates had fewer than {} sites and \
were NOT evaluated (passed through). Use --mode adaptive or either to catch them \
with the count test.",
stats.below_min_sites_templates, args.min_sites
);
}
}
struct StartedRun {
wall_start: Instant,
}
impl StartedRun {
fn now() -> Self {
Self { wall_start: Instant::now() }
}
}
fn report(started: &StartedRun, n_templates: u64, quiet: bool) {
if quiet {
return;
}
let wall = started.wall_start.elapsed().as_secs_f64();
let stderr = std::io::stderr();
let mut stderr = stderr.lock();
#[cfg(unix)]
if let Some(ru) = read_rusage() {
let user = ru.user_secs;
let sys = ru.sys_secs;
let rss_mb = ru.max_rss_bytes as f64 / (1024.0 * 1024.0);
let _ = writeln!(
stderr,
"methylsieve: Processed {n_templates} templates in {wall:.2}s wall, \
{user:.2}s user CPU, {sys:.2}s system CPU, max RSS {rss_mb:.1} MB.",
);
return;
}
let _ = writeln!(stderr, "methylsieve: Processed {n_templates} templates in {wall:.2}s wall.");
}
#[cfg(unix)]
struct Rusage {
user_secs: f64,
sys_secs: f64,
max_rss_bytes: u64,
}
#[cfg(unix)]
fn read_rusage() -> Option<Rusage> {
let mut ru: libc::rusage = unsafe { std::mem::zeroed() };
let rc = unsafe { libc::getrusage(libc::RUSAGE_SELF, &mut ru) };
if rc != 0 {
return None;
}
let user_secs = ru.ru_utime.tv_sec as f64 + ru.ru_utime.tv_usec as f64 * 1e-6;
let sys_secs = ru.ru_stime.tv_sec as f64 + ru.ru_stime.tv_usec as f64 * 1e-6;
let max_rss = ru.ru_maxrss as u64;
#[cfg(target_os = "macos")]
let max_rss_bytes = max_rss;
#[cfg(not(target_os = "macos"))]
let max_rss_bytes = max_rss.saturating_mul(1024);
Some(Rusage { user_secs, sys_secs, max_rss_bytes })
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn parse_contexts_default_is_cph() {
let m = parse_contexts("CpA,CpC,CpT").unwrap();
assert!(m.contains(Context::CpA));
assert!(m.contains(Context::CpC));
assert!(m.contains(Context::CpT));
assert!(!m.contains(Context::CpG));
}
#[test]
fn parse_contexts_accepts_short_spelling_and_case() {
let m = parse_contexts("ca,CG").unwrap();
assert!(m.contains(Context::CpA));
assert!(m.contains(Context::CpG));
assert!(!m.contains(Context::CpC));
}
#[test]
fn parse_contexts_rejects_unknown() {
assert!(parse_contexts("CpA,CpZ").is_err());
assert!(parse_contexts("").is_err());
}
#[test]
fn parse_tag_spec_default() {
let t = parse_tag_spec("XX:Z:UC").unwrap();
assert_eq!(t.tag, [b'X', b'X']);
assert_eq!(t.value, b"UC");
}
#[test]
fn parse_tag_spec_rejects_non_z_and_malformed() {
assert!(parse_tag_spec("XX:i:3").is_err());
assert!(parse_tag_spec("X:Z:UC").is_err());
assert!(parse_tag_spec("XX:Z:").is_err());
assert!(parse_tag_spec("garbage").is_err());
}
#[test]
fn validate_rejects_non_bam_output() {
let mut a = minimal_args();
a.output = Some(PathBuf::from("out.sam"));
assert!(a.validate().is_err());
}
#[test]
fn validate_rejects_default_bam_stdout_plus_stats_stdout() {
let mut a = minimal_args();
a.output = None;
a.stats = Some(PathBuf::from("-"));
let err = a.validate().unwrap_err().to_string();
assert!(err.contains("stdout"), "got: {err}");
}
#[test]
fn validate_rejects_stats_and_matrix_both_to_stdout() {
let mut a = minimal_args();
a.output = Some(PathBuf::from("out.bam")); a.stats = Some(PathBuf::from("-"));
a.conversion_matrix = Some(PathBuf::from("-"));
assert!(a.validate().is_err());
}
#[test]
fn validate_allows_a_single_stdout_stream() {
let mut a = minimal_args();
a.output = Some(PathBuf::from("out.bam"));
a.stats = Some(PathBuf::from("-"));
a.conversion_matrix = Some(PathBuf::from("matrix.tsv"));
assert!(a.validate().is_ok());
}
#[test]
fn validate_rejects_bad_fraction() {
let mut a = minimal_args();
a.max_unconverted_fraction = 1.5;
assert!(a.validate().is_err());
a.max_unconverted_fraction = 0.0;
assert!(a.validate().is_err());
a.max_unconverted_fraction = 0.5;
assert!(a.validate().is_ok());
}
fn minimal_args() -> Args {
Args {
input: None,
output: None,
compression_level: CompressionLevel::new(0).unwrap(),
reference: PathBuf::from("ref.fa"),
ref_encoding: RefEncodingCli::Bytes,
contexts: parse_contexts("CpA,CpC,CpT").unwrap(),
mode: DecisionModeCli::Adaptive,
max_unconverted_count: 3,
max_unconverted_fraction: 0.05,
min_sites: 40,
min_base_quality: 20,
ignore_template_ends: 0,
ignore_supplementary_evidence: false,
tag: parse_tag_spec("XX:Z:UC").unwrap(),
count_tag: [b'c', b'h'],
no_count_tag: false,
no_qc_fail: false,
remove_unconverted: false,
control_contig: vec![],
stats: None,
conversion_matrix: None,
sample: None,
quiet: true,
check_crc: false,
no_check_crc: false,
read_buffer_mb: 16,
write_buffer_mb: 64,
}
}
}