use crate::bam_io::create_raw_bam_reader;
use crate::logging::OperationTimer;
use crate::sam::SamTag;
use crate::sort::{QuerynameComparator, RawExternalSorter, SortOrder};
use crate::validation::validate_file_exists;
use anyhow::{Result, bail};
use bytesize::ByteSize;
use clap::Parser;
use log::info;
use std::path::PathBuf;
use crate::commands::command::Command;
use crate::commands::common::{CompressionOptions, detect_total_memory, parse_bool};
use crate::validation::parse_memory_size;
/// Sort order selectable through the `--order` command-line option.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SortOrderArg {
    /// Selected by `coordinate`.
    Coordinate,
    /// Selected by `queryname`, `queryname::lex`, or `queryname::lexicographic`.
    Queryname,
    /// Selected by `queryname::natural`.
    QuerynameNatural,
    /// Selected by `template-coordinate`.
    TemplateCoordinate,
}

impl SortOrderArg {
    /// Parses the textual value of `--order` into a [`SortOrderArg`].
    ///
    /// Matching is exact and case-sensitive.
    ///
    /// # Errors
    ///
    /// Returns a human-readable message when `s` is not a recognised order.
    /// Values that start with `queryname::` but carry an unknown sub-sort get
    /// a message listing the valid sub-sorts; anything else gets a message
    /// listing every accepted order.
    pub fn parse(s: &str) -> Result<Self, String> {
        let parsed = match s {
            "coordinate" => Self::Coordinate,
            "template-coordinate" => Self::TemplateCoordinate,
            "queryname::natural" => Self::QuerynameNatural,
            "queryname" | "queryname::lex" | "queryname::lexicographic" => Self::Queryname,
            other => {
                // Distinguish "right family, wrong sub-sort" from a wholly unknown order.
                let message = match other.strip_prefix("queryname::") {
                    Some(sub) => format!(
                        "unknown queryname sub-sort '{sub}', expected 'lex', 'lexicographic', or 'natural'"
                    ),
                    None => format!(
                        "unknown sort order '{other}', expected 'coordinate', 'queryname', \
                         'queryname::lex', 'queryname::lexicographic', 'queryname::natural', \
                         or 'template-coordinate'"
                    ),
                };
                return Err(message);
            }
        };
        Ok(parsed)
    }
}
/// Maps the command-line sort order onto the sorter's [`SortOrder`].
impl From<SortOrderArg> for SortOrder {
    /// Both queryname variants become `SortOrder::Queryname`, differing only
    /// in the comparator they carry.
    fn from(arg: SortOrderArg) -> Self {
        match arg {
            SortOrderArg::Coordinate => Self::Coordinate,
            SortOrderArg::TemplateCoordinate => Self::TemplateCoordinate,
            SortOrderArg::Queryname => {
                let comparator = QuerynameComparator::Lexicographic;
                Self::Queryname(comparator)
            }
            SortOrderArg::QuerynameNatural => {
                let comparator = QuerynameComparator::Natural;
                Self::Queryname(comparator)
            }
        }
    }
}
// Command-line arguments for the `sort` subcommand.
//
// NOTE: plain `//` comments are used here on purpose. With clap's derive API,
// `///` doc comments on the struct or its fields are turned into help text,
// which would change the program's `--help` output.
#[derive(Debug, Parser)]
#[command(
name = "sort",
about = "\x1b[38;5;72m[ALIGNMENT]\x1b[0m \x1b[36mSort BAM file by coordinate, queryname, or template-coordinate\x1b[0m",
long_about = r#"
Sort a BAM file using high-performance external merge-sort.
This tool provides efficient BAM sorting with support for multiple sort orders:
SORT ORDERS:
coordinate Standard genomic coordinate sort (tid → pos → strand).
Use for IGV visualization, variant calling, `fgumi review`.
queryname Lexicographic read name sort (fast, default sub-sort).
queryname::lex Short alias for lexicographic ordering (same as above).
queryname::lexicographic Explicit lexicographic ordering (same as above).
queryname::natural Natural numeric ordering (samtools-compatible).
Use for `fgumi zipper`, template-level operations.
template-coordinate Template-level position sort for UMI grouping.
Use for `fgumi group`, `fgumi dedup`, and `fgumi downsample` input.
PERFORMANCE:
- 1.9x faster than samtools on template-coordinate sort
- Handles BAM files larger than available RAM via spill-to-disk
- Uses parallel sorting (--threads) for in-memory chunks
- Configurable temp file compression (--temp-compression)
- Default 768M per-thread memory limit (samtools-compatible); pass
`--max-memory auto` to detect system memory (opt-in)
EXAMPLES:
# Sort for fgumi group input
fgumi sort -i aligned.bam -o sorted.bam --order template-coordinate
# Sort by coordinate for IGV
fgumi sort -i input.bam -o sorted.bam --order coordinate
# Sort by queryname for zipper
fgumi sort -i input.bam -o sorted.bam --order queryname
# Multi-threaded sort (default 768M per thread)
fgumi sort -i input.bam -o sorted.bam --order template-coordinate --threads 8
# Override the per-thread memory limit
fgumi sort -i input.bam -o sorted.bam -m 2GiB --threads 8
# Opt in to auto-detected system memory (subtracts --memory-reserve)
fgumi sort -i input.bam -o sorted.bam -m auto --threads 8
# Reserve extra memory for bwa mem running in a pipeline
fgumi sort -i input.bam -o sorted.bam --memory-reserve 12GiB --threads 4
# Verify a BAM file is correctly sorted
fgumi sort -i sorted.bam --verify --order template-coordinate
# Spread spill chunks across multiple temp dirs (round-robin, free-space aware)
fgumi sort -i in.bam -o out.bam -T /mnt/ssd1 -T /mnt/ssd2
# Same via FGUMI_TMP_DIRS env var (PATH-style list)
FGUMI_TMP_DIRS=/mnt/ssd1:/mnt/ssd2 fgumi sort -i in.bam -o out.bam
"#
)]
// Several independent on/off CLI options are stored as `bool` fields.
#[allow(clippy::struct_excessive_bools)]
pub struct Sort {
// Input BAM path (`-i`/`--input`); `execute` checks that the file exists.
#[arg(short = 'i', long = "input")]
pub input: PathBuf,
// Output BAM path (`-o`/`--output`); `execute` requires it for sorting and
// rejects it in combination with `--verify`.
#[arg(short = 'o', long = "output")]
pub output: Option<PathBuf>,
// `--verify[=BOOL]`: check the input's sort order instead of sorting.
// A bare `--verify` means `true`; the default is `false`.
#[arg(long = "verify", default_value = "false", num_args = 0..=1, default_missing_value = "true", action = clap::ArgAction::Set, value_parser = parse_bool)]
pub verify: bool,
// `--order`: requested sort order, parsed by `SortOrderArg::parse`;
// defaults to `template-coordinate`.
#[arg(long = "order", default_value = "template-coordinate", value_parser = SortOrderArg::parse)]
pub order: SortOrderArg,
// `-m`/`--max-memory`: a memory size or `auto`, parsed by `parse_memory`;
// defaults to `768M`.
#[arg(short = 'm', long = "max-memory", default_value = "768M", value_parser = parse_memory)]
pub max_memory: MemoryLimit,
// `--memory-reserve`: a memory size or `auto`, parsed by
// `parse_memory_reserve`; only consulted when `--max-memory` is `auto`.
#[arg(long = "memory-reserve", default_value = "auto", value_parser = parse_memory_reserve)]
pub memory_reserve: MemoryReserve,
// `--memory-per-thread[=BOOL]`: when true (the default) a fixed
// `--max-memory` is multiplied by `--threads` to form the total budget.
#[arg(long = "memory-per-thread", default_value = "true", num_args = 0..=1, default_missing_value = "true", action = clap::ArgAction::Set, value_parser = parse_bool)]
pub memory_per_thread: bool,
// `-T`/`--tmp-dir`: repeatable; each occurrence appends one directory.
// When none are given, `execute_sort` falls back to `FGUMI_TMP_DIRS`.
#[arg(short = 'T', long = "tmp-dir", action = clap::ArgAction::Append)]
pub tmp_dirs: Vec<PathBuf>,
// `-@`/`--threads` (short alias `-t`): thread count, default 1.
// A value of 0 is rejected by `resolve_memory_limit`.
#[arg(short = '@', short_alias = 't', long = "threads", default_value = "1")]
pub threads: usize,
// Shared compression options; `compression_level` is used for the output.
#[command(flatten)]
pub compression: CompressionOptions,
// `--temp-compression`: level for temporary files, restricted to 0..=9,
// default 1.
#[arg(long = "temp-compression", default_value = "1", value_parser = clap::value_parser!(u32).range(0..=9))]
pub temp_compression: u32,
// `--write-index[=BOOL]`: request an index alongside the output;
// `execute_sort` only accepts it for coordinate order.
#[arg(long = "write-index", default_value = "false", num_args = 0..=1, default_missing_value = "true", action = clap::ArgAction::Set, value_parser = parse_bool)]
pub write_index: bool,
// `--async-reader`: hidden flag forwarded to the sorter's `async_reader`
// setting.
#[arg(long = "async-reader", default_value_t = false, hide = true)]
pub async_reader: bool,
}
/// Parsed value of `--max-memory`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum MemoryLimit {
/// Derive the budget from detected system memory minus the reserve
/// (see `resolve_memory_limit`).
Auto,
/// Explicit size in bytes; treated as per-thread or as a total depending
/// on `--memory-per-thread`.
Fixed(usize),
}
/// Parsed value of `--memory-reserve`: memory held back from the detected
/// total when `--max-memory` is `auto`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum MemoryReserve {
/// Half of total memory, capped at `AUTO_RESERVE_CAP` (see `resolve_reserve`).
Auto,
/// Explicit reserve in bytes.
Fixed(usize),
}
/// Parses a memory-size string into a byte count that fits in `usize`.
///
/// `label` is only used as a prefix in the "too large" error message.
///
/// # Errors
///
/// Returns the stringified error from `parse_memory_size`, or
/// `"{label} too large: {bytes}"` when the value does not fit in `usize`.
fn parse_memory_bytes(s: &str, label: &str) -> Result<usize, String> {
    match parse_memory_size(s) {
        Err(e) => Err(e.to_string()),
        Ok(bytes) => match usize::try_from(bytes) {
            Ok(fits) => Ok(fits),
            Err(_) => Err(format!("{label} too large: {bytes}")),
        },
    }
}
/// Value parser for `--max-memory`.
///
/// Surrounding whitespace is ignored. The word `auto` (any ASCII case) yields
/// [`MemoryLimit::Auto`]; anything else is parsed as a memory size.
///
/// # Errors
///
/// Propagates the message produced by `parse_memory_bytes`.
pub(crate) fn parse_memory(s: &str) -> Result<MemoryLimit, String> {
    let trimmed = s.trim();
    if trimmed.eq_ignore_ascii_case("auto") {
        Ok(MemoryLimit::Auto)
    } else {
        parse_memory_bytes(trimmed, "Memory size").map(MemoryLimit::Fixed)
    }
}
/// Name of the environment variable read for temp directories when no
/// `-T`/`--tmp-dir` flag is supplied.
const TMP_DIRS_ENV: &str = "FGUMI_TMP_DIRS";
/// Chooses the temp directories to use for a sort.
///
/// Directories given on the command line (`cli`) win outright. Otherwise
/// `env_value` is split as a platform path list (via `std::env::split_paths`)
/// and any empty or whitespace-only segments are dropped. Returns an empty
/// vector when neither source provides a directory.
pub(crate) fn resolve_tmp_dirs(cli: &[PathBuf], env_value: Option<&str>) -> Vec<PathBuf> {
    if let [_, ..] = cli {
        return cli.to_vec();
    }
    match env_value {
        Some(raw) if !raw.is_empty() => std::env::split_paths(raw)
            .filter(|dir| {
                let blank = dir.as_os_str().is_empty();
                !blank && !dir.to_string_lossy().trim().is_empty()
            })
            .collect(),
        _ => Vec::new(),
    }
}
/// Value parser for `--memory-reserve`.
///
/// Surrounding whitespace is ignored. The word `auto` (any ASCII case) yields
/// [`MemoryReserve::Auto`]; anything else is parsed as a memory size.
///
/// # Errors
///
/// Propagates the message produced by `parse_memory_bytes`.
pub(crate) fn parse_memory_reserve(s: &str) -> Result<MemoryReserve, String> {
    let trimmed = s.trim();
    if !trimmed.eq_ignore_ascii_case("auto") {
        return parse_memory_bytes(trimmed, "Memory reserve").map(MemoryReserve::Fixed);
    }
    Ok(MemoryReserve::Auto)
}
/// Returns the cell-barcode tag used for the given sort order.
///
/// Only template-coordinate sorting uses a cell tag (`SamTag::CB`); every
/// other order yields `None`. The function never returns `Err`.
pub(crate) fn parse_cell_tag(order: SortOrderArg) -> Result<Option<[u8; 2]>> {
    let tag = match order {
        SortOrderArg::TemplateCoordinate => Some(*SamTag::CB),
        SortOrderArg::Coordinate | SortOrderArg::Queryname | SortOrderArg::QuerynameNatural => {
            None
        }
    };
    Ok(tag)
}
/// Result of `verify_sort_order`: `(records read, violations found,
/// first violation as (1-based record number, read name))`.
type VerifySummary = (u64, u64, Option<(u64, String)>);
/// Streams every record from `raw_reader` and counts adjacent pairs that are
/// out of order.
///
/// * `extract_key` builds a comparison key from a record's raw bytes.
/// * `is_violation(current, previous)` returns `true` when `current` must not
///   follow `previous`.
///
/// Returns the number of records read, the number of violations, and, for the
/// first violation only, the 1-based record number together with the
/// (lossily UTF-8 decoded) read name of the offending record.
///
/// # Errors
///
/// Propagates the first error yielded by the reader.
fn verify_sort_order<K>(
    raw_reader: crate::sort::raw_bam_reader::RawBamRecordReader<std::fs::File>,
    extract_key: impl Fn(&[u8]) -> K,
    is_violation: impl Fn(&K, &K) -> bool,
) -> Result<VerifySummary> {
    let mut seen: u64 = 0;
    let mut out_of_order: u64 = 0;
    let mut first_bad: Option<(u64, String)> = None;
    let mut previous: Option<K> = None;

    for item in raw_reader {
        let record = item?;
        seen += 1;
        let bytes: &[u8] = &record;
        let current = extract_key(bytes);

        // The first record has no predecessor and can never be a violation.
        let breaks_order =
            previous.as_ref().is_some_and(|earlier| is_violation(&current, earlier));
        if breaks_order {
            out_of_order += 1;
            if first_bad.is_none() {
                let view = fgumi_raw_bam::RawRecordView::new(bytes);
                let read_name = String::from_utf8_lossy(view.read_name()).to_string();
                first_bad = Some((seen, read_name));
            }
        }

        previous = Some(current);
    }

    Ok((seen, out_of_order, first_bad))
}
impl Command for Sort {
    /// Entry point for the `sort` subcommand.
    ///
    /// Rejects conflicting flag combinations, checks that the input file
    /// exists, and then either verifies the input's sort order (`--verify`)
    /// or sorts it to `--output`.
    ///
    /// # Errors
    ///
    /// Fails when `--verify` is combined with `--output` or `--write-index`,
    /// when the input file is missing, when neither `--output` nor `--verify`
    /// is given, or when the selected operation itself fails.
    fn execute(&self, command_line: &str) -> Result<()> {
        let has_output = self.output.is_some();

        // Flag conflicts are reported before touching the filesystem.
        if self.verify {
            if has_output {
                bail!("--verify cannot be used with --output");
            }
            if self.write_index {
                bail!("--write-index cannot be used with --verify");
            }
        }

        validate_file_exists(&self.input, "Input BAM")?;

        match (self.verify, has_output) {
            (false, false) => bail!("Either --output or --verify must be specified"),
            (true, _) => self.execute_verify(),
            (false, true) => self.execute_sort(command_line),
        }
    }
}
/// Floor (256 MiB) applied to the auto-detected memory budget: per thread in
/// per-thread mode, or to the whole budget otherwise.
const MIN_MEMORY_PER_THREAD: usize = 256 * 1024 * 1024;
/// Upper bound (10 GiB) on the reserve chosen by `MemoryReserve::Auto`.
const AUTO_RESERVE_CAP: usize = 10 * 1024 * 1024 * 1024;
/// Turns a [`MemoryReserve`] into a byte count.
///
/// A fixed reserve is returned unchanged. The automatic reserve is half of
/// `total_memory`, but never more than `AUTO_RESERVE_CAP`.
fn resolve_reserve(reserve: MemoryReserve, total_memory: usize) -> usize {
    if let MemoryReserve::Fixed(bytes) = reserve {
        return bytes;
    }
    let half_of_total = total_memory / 2;
    std::cmp::min(AUTO_RESERVE_CAP, half_of_total)
}
/// Computes the total memory budget, in bytes, handed to the sorter.
///
/// * Fixed limit: the value itself, multiplied by `threads` when
///   `memory_per_thread` is set.
/// * Auto limit: detected system memory minus the resolved reserve. In
///   per-thread mode the remainder is divided across threads with a floor of
///   `MIN_MEMORY_PER_THREAD` each; otherwise the floor applies to the whole
///   budget.
///
/// A warning is logged whenever the resulting budget exceeds what is
/// available or the total system memory; the budget is still returned.
///
/// # Errors
///
/// Fails when `threads` is zero or when a multiplication by `threads`
/// overflows `usize`.
fn resolve_memory_limit(
    limit: MemoryLimit,
    reserve: MemoryReserve,
    threads: usize,
    memory_per_thread: bool,
) -> Result<usize> {
    if threads == 0 {
        bail!("--threads must be at least 1");
    }

    let total = detect_total_memory();

    let total_budget = match (limit, memory_per_thread) {
        (MemoryLimit::Fixed(bytes), false) => bytes,
        (MemoryLimit::Fixed(bytes), true) => bytes
            .checked_mul(threads)
            .ok_or_else(|| anyhow::anyhow!("--max-memory × --threads overflowed"))?,
        (MemoryLimit::Auto, true) => {
            let margin = resolve_reserve(reserve, total);
            let available = total.saturating_sub(margin);
            let per_thread = std::cmp::max(available / threads, MIN_MEMORY_PER_THREAD);
            let budget = per_thread
                .checked_mul(threads)
                .ok_or_else(|| anyhow::anyhow!("auto memory budget overflowed"))?;
            // The per-thread floor can push the sum past what is actually available.
            if budget > available {
                log::warn!(
                    "Auto memory: total budget {} exceeds available {} \
                     ({}/thread x {} threads, reserve {}); may spill to disk earlier than expected",
                    ByteSize(budget as u64),
                    ByteSize(available as u64),
                    ByteSize(per_thread as u64),
                    threads,
                    ByteSize(margin as u64),
                );
            }
            info!(
                "Auto memory: using {} of {} ({}/thread x {} threads, reserve {})",
                ByteSize(budget as u64),
                ByteSize(total as u64),
                ByteSize(per_thread as u64),
                threads,
                ByteSize(margin as u64),
            );
            budget
        }
        (MemoryLimit::Auto, false) => {
            let margin = resolve_reserve(reserve, total);
            let available = total.saturating_sub(margin);
            let budget = std::cmp::max(available, MIN_MEMORY_PER_THREAD);
            info!(
                "Auto memory: using {} of {} (fixed total, reserve {})",
                ByteSize(budget as u64),
                ByteSize(total as u64),
                ByteSize(margin as u64),
            );
            budget
        }
    };

    if total_budget > total {
        log::warn!(
            "Memory budget {} exceeds total system memory {}; spill-to-disk is likely",
            ByteSize(total_budget as u64),
            ByteSize(total as u64),
        );
    }

    Ok(total_budget)
}
impl Sort {
fn parse_cell_tag(&self) -> Result<Option<[u8; 2]>> {
parse_cell_tag(self.order)
}
fn execute_sort(&self, command_line: &str) -> Result<()> {
let output = self.output.as_ref().expect("output required for sort mode");
if self.write_index && !matches!(self.order, SortOrderArg::Coordinate) {
bail!("--write-index is only valid for coordinate sort");
}
let timer = OperationTimer::new("Sorting BAM");
let effective_memory = resolve_memory_limit(
self.max_memory,
self.memory_reserve,
self.threads,
self.memory_per_thread,
)?;
let cell_tag = self.parse_cell_tag()?;
info!("Starting Sort");
info!("Input: {}", self.input.display());
info!("Output: {}", output.display());
info!("Sort order: {:?}", self.order);
if let Some(ct) = cell_tag {
info!("Cell tag: {}{}", ct[0] as char, ct[1] as char);
}
if let MemoryLimit::Fixed(per_thread) = self.max_memory {
if self.memory_per_thread {
info!(
"Max memory: {} ({}/thread x {} threads)",
ByteSize(effective_memory as u64),
ByteSize(per_thread as u64),
self.threads
);
} else {
info!("Max memory: {} (fixed)", ByteSize(effective_memory as u64));
}
}
info!("Threads: {}", self.threads);
info!("Temp compression level: {}", self.temp_compression);
if self.write_index {
info!("Write index: enabled");
}
let env_value = std::env::var(TMP_DIRS_ENV).ok();
let resolved_tmp_dirs = resolve_tmp_dirs(&self.tmp_dirs, env_value.as_deref());
if !resolved_tmp_dirs.is_empty() {
let joined = resolved_tmp_dirs
.iter()
.map(|p| p.display().to_string())
.collect::<Vec<_>>()
.join(", ");
info!("Temp directories: {joined}");
}
let mut sorter = RawExternalSorter::new(self.order.into())
.memory_limit(effective_memory)
.threads(self.threads)
.output_compression(self.compression.compression_level)
.temp_compression(self.temp_compression)
.write_index(self.write_index)
.async_reader(self.async_reader)
.pg_info(crate::version::VERSION.to_string(), command_line.to_string());
if matches!(self.max_memory, MemoryLimit::Auto) {
let init = 768_usize
.checked_mul(1024 * 1024)
.and_then(|b| b.checked_mul(self.threads))
.ok_or_else(|| anyhow::anyhow!("initial auto buffer size overflowed"))?;
sorter = sorter.initial_capacity(effective_memory.min(init));
}
if let Some(ct) = cell_tag {
sorter = sorter.cell_tag(ct);
}
if !resolved_tmp_dirs.is_empty() {
sorter = sorter.temp_dirs(resolved_tmp_dirs);
}
let stats = sorter.sort(&self.input, output)?;
let (total_records, output_records, chunks_written) =
(stats.total_records, stats.output_records, stats.chunks_written);
info!("=== Summary ===");
info!("Records processed: {total_records}");
info!("Records written: {output_records}");
if chunks_written > 0 {
info!("Temporary chunks: {chunks_written}");
}
info!("Output: {}", output.display());
timer.log_completion(total_records);
Ok(())
}
fn execute_verify(&self) -> Result<()> {
use crate::sort::raw_bam_reader::RawBamRecordReader;
use crate::sort::{
LibraryLookup, RawQuerynameKey, RawQuerynameLexKey, RawSortKey, SortContext, cb_hasher,
extract_coordinate_key_inline, extract_template_key_inline,
};
use std::cmp::Ordering;
use std::fs::File;
let cell_tag = self.parse_cell_tag()?;
let timer = OperationTimer::new("Verifying BAM sort order");
info!("Starting Sort Verification");
info!("Input: {}", self.input.display());
info!("Expected order: {:?}", self.order);
if let Some(ct) = cell_tag {
info!("Cell tag: {}{}", ct[0] as char, ct[1] as char);
}
let (_, header) = create_raw_bam_reader(&self.input, 1)?;
let file = File::open(&self.input)?;
let mut raw_reader = RawBamRecordReader::new(file)?;
raw_reader.skip_header()?;
let (total_records, violations, first_violation) = match self.order {
SortOrderArg::Coordinate => {
let nref = header.reference_sequences().len() as u32;
verify_sort_order(
raw_reader,
|bam| extract_coordinate_key_inline(bam, nref),
|key, prev| key < prev,
)?
}
SortOrderArg::Queryname => {
let ctx = SortContext::from_header(&header);
verify_sort_order(
raw_reader,
|bam| RawQuerynameLexKey::extract(bam, &ctx),
|key, prev| key < prev,
)?
}
SortOrderArg::QuerynameNatural => {
let ctx = SortContext::from_header(&header);
verify_sort_order(
raw_reader,
|bam| RawQuerynameKey::extract(bam, &ctx),
|key, prev| key < prev,
)?
}
SortOrderArg::TemplateCoordinate => {
let lib_lookup = LibraryLookup::from_header(&header);
let hasher = cb_hasher();
verify_sort_order(
raw_reader,
|bam| extract_template_key_inline(bam, &lib_lookup, cell_tag.as_ref(), &hasher),
|key, prev| key.core_cmp(prev) == Ordering::Less,
)?
}
};
info!("=== Verification Summary ===");
info!("Records checked: {total_records}");
info!("Sort order violations: {violations}");
if violations > 0 {
if let Some((record_num, name)) = first_violation {
info!("First violation at record {record_num}: {name}");
}
timer.log_completion(total_records);
bail!(
"BAM file is NOT correctly sorted by {:?}: {violations} violations found",
self.order
);
}
info!("Result: PASS - file is correctly sorted by {:?}", self.order);
timer.log_completion(total_records);
Ok(())
}
}
// Unit tests for the `sort` command: argument parsing, temp-dir and memory
// resolution, sort-order conversion, and sort-order verification.
#[cfg(test)]
mod tests {
use super::*;
use clap::Parser;
use rstest::rstest;
// No CLI dirs and a missing or empty env value resolve to no directories.
#[test]
fn test_resolve_tmp_dirs_empty() {
assert!(resolve_tmp_dirs(&[], None).is_empty());
assert!(resolve_tmp_dirs(&[], Some("")).is_empty());
}
// CLI directories are returned unchanged when no env value is given.
#[test]
fn test_resolve_tmp_dirs_cli_only() {
let cli = vec![PathBuf::from("/tmp/a"), PathBuf::from("/tmp/b")];
let got = resolve_tmp_dirs(&cli, None);
assert_eq!(got, cli);
}
// With no CLI directories, the env value is split as a platform path list.
#[test]
fn test_resolve_tmp_dirs_env_only() {
#[cfg(unix)]
let env = "/tmp/x:/tmp/y";
#[cfg(windows)]
let env = "C:/tmp/x;C:/tmp/y";
let got = resolve_tmp_dirs(&[], Some(env));
assert_eq!(got.len(), 2);
assert!(got[0].to_string_lossy().ends_with('x'));
assert!(got[1].to_string_lossy().ends_with('y'));
}
// CLI directories take precedence over the env value.
#[test]
fn test_resolve_tmp_dirs_cli_overrides_env() {
let cli = vec![PathBuf::from("/tmp/cli")];
#[cfg(unix)]
let env = "/tmp/env1:/tmp/env2";
#[cfg(windows)]
let env = "C:/tmp/env1;C:/tmp/env2";
let got = resolve_tmp_dirs(&cli, Some(env));
assert_eq!(got, cli, "CLI flags must take precedence over env var");
}
// Empty segments (doubled or trailing separators) are dropped.
#[test]
fn test_resolve_tmp_dirs_skips_empty_segments() {
#[cfg(unix)]
let env = "/tmp/a::/tmp/b:";
#[cfg(windows)]
let env = "C:/tmp/a;;C:/tmp/b;";
let got = resolve_tmp_dirs(&[], Some(env));
assert_eq!(got.len(), 2, "empty path segments must be filtered: {got:?}");
}
// `-T`/`--tmp-dir` may be repeated; values accumulate in order.
#[rstest]
#[case::zero(&[], vec![])]
#[case::single_short(&["-T", "/tmp/a"], vec![PathBuf::from("/tmp/a")])]
#[case::multiple_short(
&["-T", "/tmp/a", "-T", "/tmp/b", "-T", "/tmp/c"],
vec![PathBuf::from("/tmp/a"), PathBuf::from("/tmp/b"), PathBuf::from("/tmp/c")],
)]
#[case::multiple_long(
&["--tmp-dir", "/tmp/a", "--tmp-dir", "/tmp/b"],
vec![PathBuf::from("/tmp/a"), PathBuf::from("/tmp/b")],
)]
fn test_clap_tmp_dir_repeatable(#[case] extra: &[&str], #[case] expected: Vec<PathBuf>) {
let base = ["sort", "-i", "in.bam", "-o", "out.bam", "--order", "coordinate"];
let args: Vec<&str> = base.iter().copied().chain(extra.iter().copied()).collect();
let sort = Sort::try_parse_from(args).expect("parse should succeed");
assert_eq!(sort.tmp_dirs, expected);
}
// Builds a `Sort` with fixed 512 MiB memory, one thread, and no output,
// varying only the sort order.
fn make_sort(order: SortOrderArg) -> Sort {
Sort {
input: PathBuf::from("test.bam"),
output: None,
verify: false,
order,
max_memory: MemoryLimit::Fixed(512 * 1024 * 1024),
memory_reserve: MemoryReserve::Auto,
memory_per_thread: true,
tmp_dirs: Vec::new(),
threads: 1,
compression: CompressionOptions::default(),
temp_compression: 1,
write_index: false,
async_reader: false,
}
}
// Only template-coordinate order yields a cell tag (CB).
#[rstest]
#[case(SortOrderArg::TemplateCoordinate, Some(*SamTag::CB))]
#[case(SortOrderArg::Coordinate, None)]
#[case(SortOrderArg::Queryname, None)]
fn test_parse_cell_tag(#[case] order: SortOrderArg, #[case] expected: Option<[u8; 2]>) {
let sort = make_sort(order);
assert_eq!(sort.parse_cell_tag().expect("parse_cell_tag should succeed"), expected);
}
// "auto" is accepted in any ASCII case.
#[test]
fn test_parse_memory_auto() {
assert_eq!(
parse_memory("auto").expect("parse_memory should succeed for 'auto'"),
MemoryLimit::Auto
);
assert_eq!(
parse_memory("AUTO").expect("parse_memory should succeed for 'AUTO'"),
MemoryLimit::Auto
);
assert_eq!(
parse_memory("Auto").expect("parse_memory should succeed for 'Auto'"),
MemoryLimit::Auto
);
}
// Bare numbers are interpreted as MiB.
#[test]
fn test_parse_memory_plain_numbers_as_mb() {
assert_eq!(
parse_memory("768").expect("parse_memory should succeed for 768"),
MemoryLimit::Fixed(768 * 1024 * 1024)
);
assert_eq!(
parse_memory("1").expect("parse_memory should succeed for 1"),
MemoryLimit::Fixed(1024 * 1024)
);
}
// Decimal suffixes (MB/G/GB) use powers of 1000; binary suffixes (MiB/GiB)
// use powers of 1024.
#[test]
fn test_parse_memory_human_readable() {
assert_eq!(
parse_memory("512MB").expect("parse_memory should succeed for 512MB"),
MemoryLimit::Fixed(512 * 1000 * 1000)
);
assert_eq!(
parse_memory("1G").expect("parse_memory should succeed for 1G"),
MemoryLimit::Fixed(1_000_000_000)
);
assert_eq!(
parse_memory("2GB").expect("parse_memory should succeed for 2GB"),
MemoryLimit::Fixed(2_000_000_000)
);
assert_eq!(
parse_memory("1GiB").expect("parse_memory should succeed for 1GiB"),
MemoryLimit::Fixed(1024 * 1024 * 1024)
);
assert_eq!(
parse_memory("512MiB").expect("parse_memory should succeed for 512MiB"),
MemoryLimit::Fixed(512 * 1024 * 1024)
);
}
// Unit suffixes are matched case-insensitively.
#[test]
fn test_parse_memory_case_insensitive() {
assert_eq!(
parse_memory("512mb").expect("parse_memory should succeed for lowercase 512mb"),
MemoryLimit::Fixed(512 * 1000 * 1000)
);
assert_eq!(
parse_memory("1gb").expect("parse_memory should succeed for lowercase 1gb"),
MemoryLimit::Fixed(1_000_000_000)
);
}
// Fractional values with a suffix are accepted.
#[test]
fn test_parse_memory_decimal_with_suffix() {
assert_eq!(
parse_memory("1.5GB").expect("parse_memory should succeed for 1.5GB"),
MemoryLimit::Fixed(1_500_000_000)
);
}
// Empty, non-numeric, and negative inputs are rejected.
#[test]
fn test_parse_memory_invalid() {
assert!(parse_memory("").is_err());
assert!(parse_memory("abc").is_err());
assert!(parse_memory("-1G").is_err());
}
// Fixed limit in per-thread mode is multiplied by the thread count.
#[test]
fn test_resolve_memory_limit_fixed() {
let fixed = MemoryLimit::Fixed(1024 * 1024 * 1024); let resolved =
resolve_memory_limit(fixed, MemoryReserve::Auto, 4, true).expect("should succeed");
assert_eq!(resolved, 4 * 1024 * 1024 * 1024);
}
// Fixed limit without per-thread mode is used as the total budget.
#[test]
fn test_resolve_memory_limit_fixed_no_per_thread() {
let fixed = MemoryLimit::Fixed(4 * 1024 * 1024 * 1024); let resolved =
resolve_memory_limit(fixed, MemoryReserve::Auto, 4, false).expect("should succeed");
assert_eq!(resolved, 4 * 1024 * 1024 * 1024);
}
// Auto limit in per-thread mode respects the per-thread floor and, on
// machines with enough memory, stays within total system memory.
#[test]
fn test_resolve_memory_limit_auto() {
let total = detect_total_memory();
let resolved = resolve_memory_limit(MemoryLimit::Auto, MemoryReserve::Auto, 4, true)
.expect("should succeed");
let min_expected = MIN_MEMORY_PER_THREAD.saturating_mul(4).min(total);
assert!(
resolved >= min_expected,
"auto resolved to {resolved} bytes, expected at least {min_expected}"
);
if total >= MIN_MEMORY_PER_THREAD.saturating_mul(4) {
assert!(resolved <= total);
}
}
// Auto limit without per-thread mode is at least 256 MiB.
#[test]
fn test_resolve_memory_limit_auto_no_per_thread() {
let resolved = resolve_memory_limit(MemoryLimit::Auto, MemoryReserve::Auto, 8, false)
.expect("should succeed");
assert!(resolved >= 256 * 1024 * 1024);
}
// Auto reserve is half of total memory, capped at 10 GiB.
#[test]
fn test_resolve_reserve_auto() {
let gib = 1024 * 1024 * 1024;
assert_eq!(resolve_reserve(MemoryReserve::Auto, 32 * gib), 10 * gib);
assert_eq!(resolve_reserve(MemoryReserve::Auto, 16 * gib), 8 * gib);
assert_eq!(resolve_reserve(MemoryReserve::Auto, 8 * gib), 4 * gib);
assert_eq!(resolve_reserve(MemoryReserve::Auto, 128 * gib), 10 * gib);
}
// Fixed reserve is returned as given, regardless of total memory.
#[test]
fn test_resolve_reserve_fixed() {
let gib = 1024 * 1024 * 1024;
assert_eq!(resolve_reserve(MemoryReserve::Fixed(12 * gib), 64 * gib), 12 * gib);
}
// `--memory-reserve` accepts "auto" as well as binary and decimal sizes.
#[test]
fn test_parse_memory_reserve() {
assert_eq!(parse_memory_reserve("auto").expect("should parse 'auto'"), MemoryReserve::Auto,);
assert_eq!(
parse_memory_reserve("10GiB").expect("should parse '10GiB'"),
MemoryReserve::Fixed(10 * 1024 * 1024 * 1024),
);
assert_eq!(
parse_memory_reserve("8G").expect("should parse '8G'"),
MemoryReserve::Fixed(8_000_000_000),
);
}
// A larger fixed reserve must yield a smaller auto budget.
// NOTE(review): on a host where both budgets hit the per-thread floor the two
// results would be equal and this strict `<` would fail — confirm CI hosts
// have enough memory.
#[test]
fn test_resolve_memory_limit_auto_with_fixed_reserve() {
let large_reserve = resolve_memory_limit(
MemoryLimit::Auto,
MemoryReserve::Fixed(512 * 1024 * 1024),
4,
true,
)
.expect("should succeed");
let small_reserve = resolve_memory_limit(
MemoryLimit::Auto,
MemoryReserve::Fixed(128 * 1024 * 1024),
4,
true,
)
.expect("should succeed");
assert!(large_reserve < small_reserve);
}
// Every `SortOrderArg` variant converts to the expected `SortOrder`.
#[test]
fn test_sort_order_conversion() {
assert_eq!(SortOrder::from(SortOrderArg::Coordinate), SortOrder::Coordinate);
assert_eq!(
SortOrder::from(SortOrderArg::Queryname),
SortOrder::Queryname(QuerynameComparator::Lexicographic)
);
assert_eq!(
SortOrder::from(SortOrderArg::QuerynameNatural),
SortOrder::Queryname(QuerynameComparator::Natural)
);
assert_eq!(
SortOrder::from(SortOrderArg::TemplateCoordinate),
SortOrder::TemplateCoordinate
);
}
// `SortOrderArg::parse` accepts all documented spellings and reports
// targeted errors for unknown orders and unknown queryname sub-sorts.
#[rstest]
#[case("coordinate", Ok(SortOrderArg::Coordinate))]
#[case("queryname", Ok(SortOrderArg::Queryname))]
#[case("queryname::lexicographic", Ok(SortOrderArg::Queryname))]
#[case("queryname::lex", Ok(SortOrderArg::Queryname))]
#[case("queryname::natural", Ok(SortOrderArg::QuerynameNatural))]
#[case("template-coordinate", Ok(SortOrderArg::TemplateCoordinate))]
#[case("queryname::fast", Err("unknown queryname sub-sort 'fast'"))]
#[case("random", Err("unknown sort order 'random'"))]
#[case("queryname::", Err("unknown queryname sub-sort ''"))]
fn test_parse_sort_order(#[case] input: &str, #[case] expected: Result<SortOrderArg, &str>) {
match expected {
Ok(order) => assert_eq!(
SortOrderArg::parse(input).expect("parse should succeed for valid sort order"),
order
),
Err(msg) => {
let err = SortOrderArg::parse(input)
.expect_err("parse should fail for invalid sort order");
assert!(err.contains(msg), "expected error containing {msg:?}, got: {err}");
}
}
}
// Lexicographic queryname order reports SO "queryname", SS "lexicographic".
#[test]
fn test_queryname_lex_header_has_subsort() {
let order = SortOrder::from(SortOrderArg::Queryname);
assert_eq!(order.header_so_tag(), "queryname");
assert_eq!(order.header_ss_tag(), Some("lexicographic"));
}
// Natural queryname order reports SO "queryname", SS "natural".
#[test]
fn test_queryname_natural_header_has_subsort() {
let order = SortOrder::from(SortOrderArg::QuerynameNatural);
assert_eq!(order.header_so_tag(), "queryname");
assert_eq!(order.header_ss_tag(), Some("natural"));
}
// Coordinate order reports SO "coordinate" and no sub-sort.
#[test]
fn test_coordinate_header_no_subsort() {
let order = SortOrder::from(SortOrderArg::Coordinate);
assert_eq!(order.header_so_tag(), "coordinate");
assert_eq!(order.header_ss_tag(), None);
}
// Template-coordinate order reports SO "unsorted", SS "template-coordinate".
#[test]
fn test_template_coordinate_header_subsort() {
let order = SortOrder::from(SortOrderArg::TemplateCoordinate);
assert_eq!(order.header_so_tag(), "unsorted");
assert_eq!(order.header_ss_tag(), Some("template-coordinate"));
}
// Three pairs written in name order: six records, no violations.
#[test]
fn test_verify_sort_order_sorted() -> Result<()> {
use crate::sam::builder::SamBuilder;
use crate::sort::raw_bam_reader::RawBamRecordReader;
let mut builder = SamBuilder::new();
let _ = builder.add_pair().name("aaa").build();
let _ = builder.add_pair().name("bbb").build();
let _ = builder.add_pair().name("ccc").build();
let dir = tempfile::tempdir()?;
let bam_path = dir.path().join("sorted.bam");
builder.write_bam(&bam_path)?;
let file = std::fs::File::open(&bam_path)?;
let mut reader = RawBamRecordReader::new(file)?;
reader.skip_header()?;
let (total, violations, first_violation) = verify_sort_order(
reader,
|bam| fgumi_raw_bam::RawRecordView::new(bam).read_name().to_vec(),
|key, prev| key < prev,
)?;
assert_eq!(total, 6); assert_eq!(violations, 0);
assert!(first_violation.is_none());
Ok(())
}
// Pairs written out of name order: violations are counted and the first
// one is reported with a record number past 1 and a non-empty name.
#[test]
fn test_verify_sort_order_unsorted() -> Result<()> {
use crate::sam::builder::SamBuilder;
use crate::sort::raw_bam_reader::RawBamRecordReader;
let mut builder = SamBuilder::new();
let _ = builder.add_pair().name("ccc").build();
let _ = builder.add_pair().name("aaa").build();
let _ = builder.add_pair().name("bbb").build();
let dir = tempfile::tempdir()?;
let bam_path = dir.path().join("unsorted.bam");
builder.write_bam(&bam_path)?;
let file = std::fs::File::open(&bam_path)?;
let mut reader = RawBamRecordReader::new(file)?;
reader.skip_header()?;
let (total, violations, first_violation) = verify_sort_order(
reader,
|bam| fgumi_raw_bam::RawRecordView::new(bam).read_name().to_vec(),
|key, prev| key < prev,
)?;
assert_eq!(total, 6);
assert!(violations > 0);
assert!(first_violation.is_some());
let (record_num, name) =
first_violation.expect("first violation should be present for unsorted file");
assert!(record_num > 1); assert!(!name.is_empty());
Ok(())
}
// A BAM with no records: zero records, zero violations.
#[test]
fn test_verify_sort_order_empty() -> Result<()> {
use crate::sam::builder::SamBuilder;
use crate::sort::raw_bam_reader::RawBamRecordReader;
let builder = SamBuilder::new();
let dir = tempfile::tempdir()?;
let bam_path = dir.path().join("empty.bam");
builder.write_bam(&bam_path)?;
let file = std::fs::File::open(&bam_path)?;
let mut reader = RawBamRecordReader::new(file)?;
reader.skip_header()?;
let (total, violations, first_violation) = verify_sort_order(
reader,
|bam| fgumi_raw_bam::RawRecordView::new(bam).read_name().to_vec(),
|key, prev| key < prev,
)?;
assert_eq!(total, 0);
assert_eq!(violations, 0);
assert!(first_violation.is_none());
Ok(())
}
// Helper: sorts a small BAM with the given order, then verifies the output
// with the matching key. Template-coordinate returns early without
// verifying.
fn sort_and_verify_pass(order_str: &str) -> Result<()> {
use crate::sort::raw_bam_reader::RawBamRecordReader;
use crate::sort::{
RawExternalSorter, RawQuerynameKey, RawQuerynameLexKey, RawSortKey, SortContext,
extract_coordinate_key_inline,
};
let mut builder = crate::sam::builder::SamBuilder::new();
let _ = builder.add_pair().name("read2").contig(0).start1(200).build();
let _ = builder.add_pair().name("read10").contig(0).start1(100).build();
let _ = builder.add_pair().name("read1").contig(1).start1(50).build();
let dir = tempfile::tempdir()?;
let input_bam = dir.path().join("input.bam");
let sorted_bam = dir.path().join("sorted.bam");
builder.write_bam(&input_bam)?;
let order_arg =
SortOrderArg::parse(order_str).expect("parse should succeed for valid sort order");
let sort_order: SortOrder = order_arg.into();
let sorter = RawExternalSorter::new(sort_order).threads(1).output_compression(6);
sorter.sort(&input_bam, &sorted_bam)?;
let file = std::fs::File::open(&sorted_bam)?;
let (_, header) = crate::bam_io::create_bam_reader(&sorted_bam, 1)?;
let mut reader = RawBamRecordReader::new(file)?;
reader.skip_header()?;
let (total, violations, _) = match order_arg {
SortOrderArg::Coordinate => {
let nref = header.reference_sequences().len() as u32;
verify_sort_order(
reader,
|bam| extract_coordinate_key_inline(bam, nref),
|key, prev| key < prev,
)?
}
SortOrderArg::Queryname => {
let ctx = SortContext::from_header(&header);
verify_sort_order(
reader,
|bam| RawQuerynameLexKey::extract(bam, &ctx),
|key, prev| key < prev,
)?
}
SortOrderArg::QuerynameNatural => {
let ctx = SortContext::from_header(&header);
verify_sort_order(
reader,
|bam| RawQuerynameKey::extract(bam, &ctx),
|key, prev| key < prev,
)?
}
SortOrderArg::TemplateCoordinate => {
return Ok(());
}
};
assert!(total > 0, "should have records for {order_str}");
assert_eq!(violations, 0, "should be sorted for {order_str}");
Ok(())
}
// Coordinate sort output passes coordinate verification.
#[test]
fn test_verify_coordinate_sorted_pass() -> Result<()> {
sort_and_verify_pass("coordinate")
}
// Default queryname sort output passes lexicographic verification.
#[test]
fn test_verify_queryname_default_sorted_pass() -> Result<()> {
sort_and_verify_pass("queryname")
}
// Explicit lexicographic spelling behaves the same as the default.
#[test]
fn test_verify_queryname_lexicographic_sorted_pass() -> Result<()> {
sort_and_verify_pass("queryname::lexicographic")
}
// Natural queryname sort output passes natural verification.
#[test]
fn test_verify_queryname_natural_sorted_pass() -> Result<()> {
sort_and_verify_pass("queryname::natural")
}
// A lexicographically sorted file ("read10" before "read2") must fail the
// natural-order verifier.
#[test]
fn test_verify_queryname_lex_fails_with_natural_verifier() -> Result<()> {
use crate::sort::raw_bam_reader::RawBamRecordReader;
use crate::sort::{RawExternalSorter, RawQuerynameKey, RawSortKey, SortContext};
let mut builder = crate::sam::builder::SamBuilder::new();
let _ = builder.add_pair().name("read2").contig(0).start1(100).build();
let _ = builder.add_pair().name("read10").contig(0).start1(200).build();
let dir = tempfile::tempdir()?;
let input_bam = dir.path().join("input.bam");
let sorted_bam = dir.path().join("sorted.bam");
builder.write_bam(&input_bam)?;
let sorter =
RawExternalSorter::new(SortOrder::Queryname(QuerynameComparator::Lexicographic))
.threads(1)
.output_compression(6);
sorter.sort(&input_bam, &sorted_bam)?;
let file = std::fs::File::open(&sorted_bam)?;
let (_, header) = crate::bam_io::create_bam_reader(&sorted_bam, 1)?;
let mut reader = RawBamRecordReader::new(file)?;
reader.skip_header()?;
let ctx = SortContext::from_header(&header);
let (total, violations, _) = verify_sort_order(
reader,
|bam| RawQuerynameKey::extract(bam, &ctx),
|key, prev| key < prev,
)?;
assert!(total > 0);
assert!(violations > 0, "lex-sorted file should fail natural verify");
Ok(())
}
// A naturally sorted file ("read2" before "read10") must fail the
// lexicographic verifier.
#[test]
fn test_verify_queryname_natural_fails_with_lex_verifier() -> Result<()> {
use crate::sort::raw_bam_reader::RawBamRecordReader;
use crate::sort::{RawExternalSorter, RawQuerynameLexKey, RawSortKey, SortContext};
let mut builder = crate::sam::builder::SamBuilder::new();
let _ = builder.add_pair().name("read2").contig(0).start1(100).build();
let _ = builder.add_pair().name("read10").contig(0).start1(200).build();
let dir = tempfile::tempdir()?;
let input_bam = dir.path().join("input.bam");
let sorted_bam = dir.path().join("sorted.bam");
builder.write_bam(&input_bam)?;
let sorter = RawExternalSorter::new(SortOrder::Queryname(QuerynameComparator::Natural))
.threads(1)
.output_compression(6);
sorter.sort(&input_bam, &sorted_bam)?;
let file = std::fs::File::open(&sorted_bam)?;
let (_, header) = crate::bam_io::create_bam_reader(&sorted_bam, 1)?;
let mut reader = RawBamRecordReader::new(file)?;
reader.skip_header()?;
let ctx = SortContext::from_header(&header);
let (total, violations, _) = verify_sort_order(
reader,
|bam| RawQuerynameLexKey::extract(bam, &ctx),
|key, prev| key < prev,
)?;
assert!(total > 0);
assert!(violations > 0, "natural-sorted file should fail lex verify");
Ok(())
}
// `--verify` together with `--output` is rejected before any file access.
#[test]
fn test_verify_conflicts_with_output() {
let sort = Sort {
verify: true,
output: Some(PathBuf::from("out.bam")),
..make_sort(SortOrderArg::Coordinate)
};
let err = sort.execute("test").unwrap_err();
assert!(err.to_string().contains("--verify cannot be used with --output"));
}
// `--verify` together with `--write-index` is rejected before any file access.
#[test]
fn test_verify_conflicts_with_write_index() {
let sort = Sort { verify: true, write_index: true, ..make_sort(SortOrderArg::Coordinate) };
let err = sort.execute("test").unwrap_err();
assert!(err.to_string().contains("--write-index cannot be used with --verify"));
}
// Records on contig 1 followed by contig 0 must fail coordinate verification.
#[test]
fn test_verify_coordinate_fails_on_unsorted() -> Result<()> {
use crate::sort::extract_coordinate_key_inline;
use crate::sort::raw_bam_reader::RawBamRecordReader;
let mut builder = crate::sam::builder::SamBuilder::new();
let _ = builder.add_pair().name("a").contig(1).start1(100).build();
let _ = builder.add_pair().name("b").contig(0).start1(200).build();
let dir = tempfile::tempdir()?;
let bam_path = dir.path().join("unsorted.bam");
builder.write_bam(&bam_path)?;
let file = std::fs::File::open(&bam_path)?;
let (_, header) = crate::bam_io::create_bam_reader(&bam_path, 1)?;
let mut reader = RawBamRecordReader::new(file)?;
reader.skip_header()?;
let nref = header.reference_sequences().len() as u32;
let (total, violations, _) = verify_sort_order(
reader,
|bam| extract_coordinate_key_inline(bam, nref),
|key, prev| key < prev,
)?;
assert!(total > 0);
assert!(violations > 0, "unsorted file should fail coordinate verify");
Ok(())
}
// End-to-end: with identical positions, template-coordinate sorting must
// place the four reads carrying CB=A contiguously in the output.
#[test]
fn test_template_coordinate_sorts_by_cell_barcode() -> Result<()> {
use crate::commands::command::Command;
use crate::sam::builder::SamBuilder;
use bstr::ByteSlice;
let dir = tempfile::tempdir()?;
let input = dir.path().join("input.bam");
let output = dir.path().join("output.bam");
let mut builder = SamBuilder::new();
let _ = builder.add_pair().name("pair_a1").contig(0).start1(100).attr("CB", "A").build();
let _ = builder.add_pair().name("pair_b").contig(0).start1(100).attr("CB", "B").build();
let _ = builder.add_pair().name("pair_a2").contig(0).start1(100).attr("CB", "A").build();
builder.write_bam(&input)?;
let mut sort = make_sort(SortOrderArg::TemplateCoordinate);
sort.input = input;
sort.output = Some(output.clone());
sort.execute("test")?;
let mut reader = noodles::bam::io::reader::Builder.build_from_path(&output)?;
let header = reader.read_header()?;
let records: Vec<_> = reader.record_bufs(&header).collect::<std::io::Result<Vec<_>>>()?;
assert_eq!(records.len(), 6, "should have 6 records (3 pairs × 2 reads)");
let names: Vec<String> = records
.iter()
.map(|r| {
r.name()
.map(|n| String::from_utf8_lossy(n.as_bytes()).into_owned())
.unwrap_or_default()
})
.collect();
let a_positions: Vec<usize> = names
.iter()
.enumerate()
.filter(|(_, n)| n.starts_with("pair_a"))
.map(|(i, _)| i)
.collect();
assert_eq!(a_positions.len(), 4, "expected 4 reads for the two CB=A pairs");
// Four indices spanning exactly 3 means they are adjacent.
let min = a_positions[0];
let max = *a_positions.last().unwrap();
assert_eq!(
max - min,
3,
"CB=A reads must be grouped together; got positions {a_positions:?}"
);
Ok(())
}
}