use crate::logging::OperationTimer;
use crate::sam::SamTag;
use crate::validation::validate_file_exists;
use anyhow::{Result, bail};
use bytesize::ByteSize;
use clap::Parser;
use fgumi_bam_io::create_raw_bam_reader;
use fgumi_sort::{
KeyTypesSpec, QuerynameComparator, RawExternalSorter, SortOrder, verify_sort_order,
};
use log::{debug, info};
use std::path::PathBuf;
use crate::commands::command::Command;
use crate::commands::common::{
CompressionOptions, MemoryLimit, MemoryReserve, parse_bool, parse_memory, parse_memory_reserve,
resolve_memory_budget,
};
/// Sort order selected on the command line via `--order`.
///
/// Values are produced by [`SortOrderArg::parse`] and converted into the
/// sorter's own `SortOrder` type through the `From` impl in this file.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SortOrderArg {
    /// Selected by `coordinate`.
    Coordinate,
    /// Selected by `queryname`, `queryname::lex`, or `queryname::lexicographic`.
    Queryname,
    /// Selected by `queryname::natural`.
    QuerynameNatural,
    /// Selected by `template-coordinate`.
    TemplateCoordinate,
}

impl SortOrderArg {
    /// Parses the textual value of `--order`.
    ///
    /// Matching is exact and case-sensitive.
    ///
    /// # Errors
    ///
    /// Returns a human-readable message when `s` is not a recognised sort
    /// order. Inputs starting with `queryname::` get a message that names the
    /// unknown sub-sort; everything else gets the full list of accepted values.
    pub fn parse(s: &str) -> Result<Self, String> {
        match s {
            "coordinate" => Ok(Self::Coordinate),
            "queryname" | "queryname::lex" | "queryname::lexicographic" => Ok(Self::Queryname),
            "queryname::natural" => Ok(Self::QuerynameNatural),
            "template-coordinate" => Ok(Self::TemplateCoordinate),
            // `strip_prefix` both tests for and removes the prefix, so no
            // separate `starts_with` check (or `expect`) is needed.
            other => Err(match other.strip_prefix("queryname::") {
                Some(sub) => format!(
                    "unknown queryname sub-sort '{sub}', expected 'lex', 'lexicographic', or 'natural'"
                ),
                None => format!(
                    "unknown sort order '{other}', expected 'coordinate', 'queryname', \
                     'queryname::lex', 'queryname::lexicographic', 'queryname::natural', \
                     or 'template-coordinate'"
                ),
            }),
        }
    }
}
/// Maps the CLI-level sort order onto the sorter's `SortOrder`.
///
/// Both queryname variants become `SortOrder::Queryname`, differing only in
/// the comparator they carry.
impl From<SortOrderArg> for SortOrder {
    fn from(arg: SortOrderArg) -> Self {
        // The non-queryname orders map one-to-one; the queryname orders only
        // pick a comparator and share the wrapping below.
        let comparator = match arg {
            SortOrderArg::Coordinate => return Self::Coordinate,
            SortOrderArg::TemplateCoordinate => return Self::TemplateCoordinate,
            SortOrderArg::Queryname => QuerynameComparator::Lexicographic,
            SortOrderArg::QuerynameNatural => QuerynameComparator::Natural,
        };
        Self::Queryname(comparator)
    }
}
// Command-line arguments for the `sort` subcommand.
//
// NOTE: plain `//` comments are used throughout this struct on purpose. With
// clap's derive API, `///` doc comments on the struct or its fields are turned
// into help text, which would change what the program prints.
#[derive(Debug, Parser)]
#[command(
    name = "sort",
    about = "\x1b[38;5;72m[ALIGNMENT]\x1b[0m \x1b[36mSort BAM file by coordinate, queryname, or template-coordinate\x1b[0m",
    long_about = r#"
Sort a BAM file using high-performance external merge-sort.
This tool provides efficient BAM sorting with support for multiple sort orders:
SORT ORDERS:
coordinate Standard genomic coordinate sort (tid → pos → strand).
Use for IGV visualization, variant calling, `fgumi review`.
queryname Lexicographic read name sort (fast, default sub-sort).
queryname::lex Short alias for lexicographic ordering (same as above).
queryname::lexicographic Explicit lexicographic ordering (same as above).
queryname::natural Natural numeric ordering (samtools-compatible).
Use for `fgumi zipper`, template-level operations.
template-coordinate Template-level position sort for UMI grouping.
Use for `fgumi group`, `fgumi dedup`, and `fgumi downsample` input.
PERFORMANCE:
- 1.9x faster than samtools on template-coordinate sort
- Handles BAM files larger than available RAM via spill-to-disk
- Uses parallel sorting (--threads) for in-memory chunks
- Configurable temp file compression (--temp-compression)
- Default 768M per-thread memory limit (samtools-compatible); pass
`--max-memory auto` to detect system memory (opt-in)
EXAMPLES:
# Sort for fgumi group input
fgumi sort -i aligned.bam -o sorted.bam --order template-coordinate
# Sort by coordinate for IGV
fgumi sort -i input.bam -o sorted.bam --order coordinate
# Sort by queryname for zipper
fgumi sort -i input.bam -o sorted.bam --order queryname
# Multi-threaded sort (default 768M per thread)
fgumi sort -i input.bam -o sorted.bam --order template-coordinate --threads 8
# Override the per-thread memory limit
fgumi sort -i input.bam -o sorted.bam -m 2GiB --threads 8
# Opt in to auto-detected system memory (subtracts --memory-reserve)
fgumi sort -i input.bam -o sorted.bam -m auto --threads 8
# Reserve extra memory for bwa mem running in a pipeline
fgumi sort -i input.bam -o sorted.bam --memory-reserve 12GiB --threads 4
# Verify a BAM file is correctly sorted
fgumi sort -i sorted.bam --verify --order template-coordinate
# Spread spill chunks across multiple temp dirs (round-robin, free-space aware)
fgumi sort -i in.bam -o out.bam -T /mnt/ssd1 -T /mnt/ssd2
# Same via FGUMI_TMP_DIRS env var (PATH-style list)
FGUMI_TMP_DIRS=/mnt/ssd1:/mnt/ssd2 fgumi sort -i in.bam -o out.bam
"#
)]
// The struct carries four independent boolean switches (`verify`,
// `memory_per_thread`, `write_index`, `async_reader`), hence the lint allowance.
#[allow(clippy::struct_excessive_bools)]
pub struct Sort {
    // Input BAM path (`-i/--input`). `execute` accepts a stdin path for sorting
    // (not for `--verify`); any other path must exist.
    #[arg(short = 'i', long = "input")]
    pub input: PathBuf,
    // Output path (`-o/--output`). Required unless `--verify` is given, and
    // rejected together with `--verify` (both enforced in `execute`).
    #[arg(short = 'o', long = "output")]
    pub output: Option<PathBuf>,
    // Verify-only mode. Takes an optional value parsed by `parse_bool`; a bare
    // `--verify` means "true", and the default is "false".
    #[arg(long = "verify", default_value = "false", num_args = 0..=1, default_missing_value = "true", action = clap::ArgAction::Set, value_parser = parse_bool)]
    pub verify: bool,
    // Sort order, parsed by `SortOrderArg::parse`; defaults to "template-coordinate".
    #[arg(long = "order", default_value = "template-coordinate", value_parser = SortOrderArg::parse)]
    pub order: SortOrderArg,
    // Optional key-type selection, parsed by `parse_key_types`. `execute_sort`
    // logs that it is ignored for any order other than template-coordinate.
    #[arg(long = "key-types", value_parser = parse_key_types)]
    pub key_types: Option<KeyTypesSpec>,
    // Memory limit (`-m/--max-memory`), parsed by `parse_memory`; defaults to "768M".
    #[arg(short = 'm', long = "max-memory", default_value = "768M", value_parser = parse_memory)]
    pub max_memory: MemoryLimit,
    // Memory reserve, parsed by `parse_memory_reserve`; defaults to "auto".
    // Passed to `resolve_memory_budget` together with `max_memory`.
    #[arg(long = "memory-reserve", default_value = "auto", value_parser = parse_memory_reserve)]
    pub memory_reserve: MemoryReserve,
    // Whether a fixed `--max-memory` is per thread. Defaults to "true"; same
    // optional-value parsing as `--verify`.
    #[arg(long = "memory-per-thread", default_value = "true", num_args = 0..=1, default_missing_value = "true", action = clap::ArgAction::Set, value_parser = parse_bool)]
    pub memory_per_thread: bool,
    // Temp directories (`-T/--tmp-dir`), repeatable. When empty, `execute_sort`
    // falls back to the `FGUMI_TMP_DIRS` environment variable.
    #[arg(short = 'T', long = "tmp-dir", action = clap::ArgAction::Append)]
    pub tmp_dirs: Vec<PathBuf>,
    // Thread count (`-@`, short alias `-t`, `--threads`); defaults to 1.
    #[arg(short = '@', short_alias = 't', long = "threads", default_value = "1")]
    pub threads: usize,
    // Shared output-compression options; `execute_sort` reads `compression_level`.
    #[command(flatten)]
    pub compression: CompressionOptions,
    // Compression level for temp files, restricted to 0..=9; defaults to 1.
    // `execute_sort` rejects level 0 when the temp codec is zstd.
    #[arg(long = "temp-compression", default_value = "1", value_parser = clap::value_parser!(u32).range(0..=9))]
    pub temp_compression: u32,
    // Codec for temp (spill) files; defaults to "zstd".
    #[arg(long = "temp-codec", default_value = "zstd")]
    pub temp_codec: fgumi_sort::SpillCodec,
    // Whether to write an index. `execute_sort` only allows this for coordinate
    // sort, and `execute` rejects it together with `--verify`.
    #[arg(long = "write-index", default_value = "false", num_args = 0..=1, default_missing_value = "true", action = clap::ArgAction::Set, value_parser = parse_bool)]
    pub write_index: bool,
    // Hidden flag forwarded to the sorter's `async_reader` setting; off by default.
    #[arg(long = "async-reader", default_value_t = false, hide = true)]
    pub async_reader: bool,
}
/// Name of the environment variable read by `execute_sort` as a fallback list
/// of temp directories when no `-T/--tmp-dir` flag was given.
const TMP_DIRS_ENV: &str = "FGUMI_TMP_DIRS";
/// Chooses the temp directories to use for spill files.
///
/// * `cli` – directories given on the command line; when non-empty they are
///   returned unchanged and `env_value` is not consulted.
/// * `env_value` – a platform path list (split with `std::env::split_paths`),
///   used only when `cli` is empty.
///
/// Segments of `env_value` that are empty or whitespace-only are dropped.
/// Returns an empty vector when neither source yields a directory.
pub(crate) fn resolve_tmp_dirs(cli: &[PathBuf], env_value: Option<&str>) -> Vec<PathBuf> {
    match (cli, env_value) {
        // No CLI directories: fall back to a non-empty environment value.
        ([], Some(raw)) if !raw.is_empty() => std::env::split_paths(raw)
            // A whitespace-only check also rejects the empty path, so a single
            // predicate covers both kinds of blank segment.
            .filter(|dir| !dir.to_string_lossy().trim().is_empty())
            .collect(),
        ([], _) => Vec::new(),
        // Command-line directories always win.
        (dirs, _) => dirs.to_vec(),
    }
}
/// Returns the cell tag associated with `order`.
///
/// Template-coordinate sorting yields `Some(SamTag::CB)`; every other order
/// yields `None`. The function never returns `Err`.
pub(crate) fn parse_cell_tag(order: SortOrderArg) -> Result<Option<SamTag>> {
    let tag = match order {
        SortOrderArg::TemplateCoordinate => Some(SamTag::CB),
        SortOrderArg::Coordinate | SortOrderArg::Queryname | SortOrderArg::QuerynameNatural => {
            None
        }
    };
    Ok(tag)
}
/// Parses the value of `--key-types`.
///
/// After trimming, and ignoring ASCII case:
/// * `full` gives `KeyTypesSpec::Full`;
/// * the empty string or `none` gives `KeyTypesSpec::None`;
/// * anything else is read as a list of tokens separated by commas and/or
///   spaces, where `cb` sets the `cb` flag and `library` or `mi` set the
///   `tertiary` flag of `KeyTypesSpec::Explicit`.
///
/// # Errors
///
/// Returns a message naming the first unrecognised token (lower-cased).
pub(crate) fn parse_key_types(s: &str) -> Result<KeyTypesSpec, String> {
    let spec = s.trim();

    // Whole-value keywords are handled before any token splitting.
    match spec.to_ascii_lowercase().as_str() {
        "full" => return Ok(KeyTypesSpec::Full),
        "" | "none" => return Ok(KeyTypesSpec::None),
        _ => {}
    }

    let (mut cb, mut tertiary) = (false, false);
    let tokens = spec
        .split([',', ' '])
        .filter(|piece| !piece.is_empty())
        .map(str::to_ascii_lowercase);
    for token in tokens {
        match token.as_str() {
            "cb" => cb = true,
            "library" | "mi" => tertiary = true,
            other => {
                return Err(format!(
                    "unknown --key-types token '{other}', expected 'full', 'none', \
                     or a list of 'cb','library','mi'"
                ));
            }
        }
    }

    Ok(KeyTypesSpec::Explicit { cb, tertiary })
}
impl Command for Sort {
    /// Validates the flag combination and dispatches to verify or sort mode.
    ///
    /// `command_line` is forwarded to `execute_sort` unchanged.
    ///
    /// # Errors
    ///
    /// Fails when `--verify` is combined with `--output` or `--write-index`,
    /// when `--verify` is asked to read from stdin, when a non-stdin input
    /// fails `validate_file_exists`, when neither `--output` nor `--verify` is
    /// given, or when the selected mode itself fails.
    fn execute(&self, command_line: &str) -> Result<()> {
        // Flags that make no sense in verify mode.
        if self.verify {
            if self.output.is_some() {
                bail!("--verify cannot be used with --output");
            }
            if self.write_index {
                bail!("--write-index cannot be used with --verify");
            }
        }

        // Stdin is acceptable for sorting only; real paths must exist.
        let reading_stdin = fgumi_bam_io::is_stdin_path(&self.input);
        match (reading_stdin, self.verify) {
            (true, true) => bail!(
                "fgumi sort --verify cannot read from stdin (it re-scans the input); \
                 provide a file path instead"
            ),
            (true, false) => {}
            (false, _) => {
                validate_file_exists(&self.input, "Input BAM")?;
            }
        }

        if self.verify {
            return self.execute_verify();
        }
        if self.output.is_none() {
            bail!("Either --output or --verify must be specified");
        }
        self.execute_sort(command_line)
    }
}
impl Sort {
fn parse_cell_tag(&self) -> Result<Option<SamTag>> {
parse_cell_tag(self.order)
}
fn execute_sort(&self, command_line: &str) -> Result<()> {
let output = self.output.as_ref().expect("output required for sort mode");
if self.write_index && !matches!(self.order, SortOrderArg::Coordinate) {
bail!("--write-index is only valid for coordinate sort");
}
if self.temp_compression == 0 && matches!(self.temp_codec, fgumi_sort::SpillCodec::Zstd) {
bail!(
"--temp-compression 0 is only supported with --temp-codec bgzf; \
zstd does not have an uncompressed mode. Pass --temp-codec bgzf \
to keep level-0 spill, or pick a zstd level >= 1."
);
}
let timer = OperationTimer::new("Sorting BAM");
let effective_memory = resolve_memory_budget(
self.max_memory,
self.memory_reserve,
self.threads,
self.memory_per_thread,
)?;
let cell_tag = self.parse_cell_tag()?;
debug!("Starting Sort");
info!("Input: {}", self.input.display());
info!("Output: {}", output.display());
info!("Sort order: {:?}", self.order);
if let Some(ct) = cell_tag {
let ct_bytes = *ct;
info!("Cell tag: {}{}", ct_bytes[0] as char, ct_bytes[1] as char);
}
if let MemoryLimit::Fixed(per_thread) = self.max_memory {
if self.memory_per_thread {
info!(
"Max memory: {} ({}/thread x {} threads)",
ByteSize(effective_memory as u64),
ByteSize(per_thread as u64),
self.threads
);
} else {
info!("Max memory: {} (fixed)", ByteSize(effective_memory as u64));
}
}
info!("Threads: {}", self.threads);
info!("Temp compression level: {}", self.temp_compression);
if self.write_index {
info!("Write index: enabled");
}
let env_value = std::env::var(TMP_DIRS_ENV).ok();
let resolved_tmp_dirs = resolve_tmp_dirs(&self.tmp_dirs, env_value.as_deref());
if !resolved_tmp_dirs.is_empty() {
let joined = resolved_tmp_dirs
.iter()
.map(|p| p.display().to_string())
.collect::<Vec<_>>()
.join(", ");
info!("Temp directories: {joined}");
}
let mut sorter = RawExternalSorter::new(self.order.into())
.memory_limit(effective_memory)
.threads(self.threads)
.output_compression(self.compression.compression_level)
.temp_compression(self.temp_compression)
.spill_codec(self.temp_codec)
.write_index(self.write_index)
.async_reader(self.async_reader)
.pg_info(crate::version::VERSION.to_string(), command_line.to_string());
if matches!(self.max_memory, MemoryLimit::Auto) {
let init = 768_usize
.checked_mul(1024 * 1024)
.and_then(|b| b.checked_mul(self.threads))
.ok_or_else(|| anyhow::anyhow!("initial auto buffer size overflowed"))?;
sorter = sorter.initial_capacity(effective_memory.min(init));
}
if let Some(ct) = cell_tag {
sorter = sorter.cell_tag(ct);
}
let key_types = self.key_types.unwrap_or_default(); if !matches!(self.order, SortOrderArg::TemplateCoordinate) && self.key_types.is_some() {
info!("--key-types is ignored for --order {:?}", self.order);
}
sorter = sorter.key_types(key_types);
if !resolved_tmp_dirs.is_empty() {
sorter = sorter.temp_dirs(resolved_tmp_dirs);
}
let stats = sorter.sort(&self.input, output)?;
let (total_records, output_records, chunks_written) =
(stats.total_records, stats.output_records, stats.chunks_written);
info!("=== Summary ===");
info!("Records processed: {total_records}");
info!("Records written: {output_records}");
if chunks_written > 0 {
info!("Temporary chunks: {chunks_written}");
}
info!("Output: {}", output.display());
timer.log_completion(total_records);
Ok(())
}
fn execute_verify(&self) -> Result<()> {
use fgumi_sort::RawBamRecordReader;
use fgumi_sort::{
LibraryLookup, RawQuerynameKey, RawQuerynameLexKey, RawSortKey, SortContext, cb_hasher,
extract_coordinate_key_inline, extract_template_key_inline,
};
use std::cmp::Ordering;
use std::fs::File;
let cell_tag = self.parse_cell_tag()?;
let timer = OperationTimer::new("Verifying BAM sort order");
debug!("Starting Sort Verification");
info!("Input: {}", self.input.display());
info!("Expected order: {:?}", self.order);
if let Some(ct) = cell_tag {
let ct_bytes = *ct;
info!("Cell tag: {}{}", ct_bytes[0] as char, ct_bytes[1] as char);
}
let (_, header) = create_raw_bam_reader(&self.input, 1)?;
let file = File::open(&self.input)?;
let mut raw_reader = RawBamRecordReader::new(file)?;
raw_reader.skip_header()?;
let (total_records, violations, first_violation) = match self.order {
SortOrderArg::Coordinate => {
let nref = header.reference_sequences().len() as u32;
verify_sort_order(
raw_reader,
|bam| extract_coordinate_key_inline(bam, nref),
|key, prev| key < prev,
)?
}
SortOrderArg::Queryname => {
let ctx = SortContext::from_header(&header);
verify_sort_order(
raw_reader,
|bam| RawQuerynameLexKey::extract(bam, &ctx),
|key, prev| key < prev,
)?
}
SortOrderArg::QuerynameNatural => {
let ctx = SortContext::from_header(&header);
verify_sort_order(
raw_reader,
|bam| RawQuerynameKey::extract(bam, &ctx),
|key, prev| key < prev,
)?
}
SortOrderArg::TemplateCoordinate => {
let lib_lookup = LibraryLookup::from_header(&header);
let hasher = cb_hasher();
verify_sort_order(
raw_reader,
|bam| extract_template_key_inline(bam, &lib_lookup, cell_tag, &hasher),
|key, prev| key.core_cmp(prev) == Ordering::Less,
)?
}
};
info!("=== Verification Summary ===");
info!("Records checked: {total_records}");
info!("Sort order violations: {violations}");
if violations > 0 {
if let Some((record_num, name)) = first_violation {
info!("First violation at record {record_num}: {name}");
}
timer.log_completion(total_records);
bail!(
"BAM file is NOT correctly sorted by {:?}: {violations} violations found",
self.order
);
}
info!("Result: PASS - file is correctly sorted by {:?}", self.order);
timer.log_completion(total_records);
Ok(())
}
}
// Unit tests for the `sort` command: temp-dir resolution, clap parsing,
// memory/reserve helpers, sort-order parsing and header tags, flag validation,
// and two end-to-end checks that write real BAM files to a temp directory.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::commands::common::{MIN_MEMORY_PER_THREAD, detect_total_memory, resolve_reserve};
    use clap::Parser;
    use rstest::rstest;
    // ---- resolve_tmp_dirs -------------------------------------------------
    // Neither source set, or an empty env value, yields no directories.
    #[test]
    fn test_resolve_tmp_dirs_empty() {
        assert!(resolve_tmp_dirs(&[], None).is_empty());
        assert!(resolve_tmp_dirs(&[], Some("")).is_empty());
    }
    // CLI directories are returned unchanged.
    #[test]
    fn test_resolve_tmp_dirs_cli_only() {
        let cli = vec![PathBuf::from("/tmp/a"), PathBuf::from("/tmp/b")];
        let got = resolve_tmp_dirs(&cli, None);
        assert_eq!(got, cli);
    }
    // The env value is split with the platform's path-list separator.
    #[test]
    fn test_resolve_tmp_dirs_env_only() {
        #[cfg(unix)]
        let env = "/tmp/x:/tmp/y";
        #[cfg(windows)]
        let env = "C:/tmp/x;C:/tmp/y";
        let got = resolve_tmp_dirs(&[], Some(env));
        assert_eq!(got.len(), 2);
        assert!(got[0].to_string_lossy().ends_with('x'));
        assert!(got[1].to_string_lossy().ends_with('y'));
    }
    // When both sources are present, the CLI list wins.
    #[test]
    fn test_resolve_tmp_dirs_cli_overrides_env() {
        let cli = vec![PathBuf::from("/tmp/cli")];
        #[cfg(unix)]
        let env = "/tmp/env1:/tmp/env2";
        #[cfg(windows)]
        let env = "C:/tmp/env1;C:/tmp/env2";
        let got = resolve_tmp_dirs(&cli, Some(env));
        assert_eq!(got, cli, "CLI flags must take precedence over env var");
    }
    // Doubled and trailing separators produce empty segments, which are dropped.
    #[test]
    fn test_resolve_tmp_dirs_skips_empty_segments() {
        #[cfg(unix)]
        let env = "/tmp/a::/tmp/b:";
        #[cfg(windows)]
        let env = "C:/tmp/a;;C:/tmp/b;";
        let got = resolve_tmp_dirs(&[], Some(env));
        assert_eq!(got.len(), 2, "empty path segments must be filtered: {got:?}");
    }
    // ---- clap parsing -----------------------------------------------------
    // `-T` / `--tmp-dir` may be repeated and values are kept in order.
    #[rstest]
    #[case::zero(&[], vec![])]
    #[case::single_short(&["-T", "/tmp/a"], vec![PathBuf::from("/tmp/a")])]
    #[case::multiple_short(
        &["-T", "/tmp/a", "-T", "/tmp/b", "-T", "/tmp/c"],
        vec![PathBuf::from("/tmp/a"), PathBuf::from("/tmp/b"), PathBuf::from("/tmp/c")],
    )]
    #[case::multiple_long(
        &["--tmp-dir", "/tmp/a", "--tmp-dir", "/tmp/b"],
        vec![PathBuf::from("/tmp/a"), PathBuf::from("/tmp/b")],
    )]
    fn test_clap_tmp_dir_repeatable(#[case] extra: &[&str], #[case] expected: Vec<PathBuf>) {
        let base = ["sort", "-i", "in.bam", "-o", "out.bam", "--order", "coordinate"];
        let args: Vec<&str> = base.iter().copied().chain(extra.iter().copied()).collect();
        let sort = Sort::try_parse_from(args).expect("parse should succeed");
        assert_eq!(sort.tmp_dirs, expected);
    }
    // `--temp-codec` defaults to zstd and accepts `zstd` and `bgzf`.
    #[rstest]
    #[case::default_omitted(&[], fgumi_sort::SpillCodec::Zstd)]
    #[case::explicit_zstd(&["--temp-codec", "zstd"], fgumi_sort::SpillCodec::Zstd)]
    #[case::explicit_bgzf(&["--temp-codec", "bgzf"], fgumi_sort::SpillCodec::Bgzf)]
    fn test_clap_temp_codec_default(
        #[case] extra: &[&str],
        #[case] expected: fgumi_sort::SpillCodec,
    ) {
        let base = ["sort", "-i", "in.bam", "-o", "out.bam", "--order", "coordinate"];
        let args: Vec<&str> = base.iter().copied().chain(extra.iter().copied()).collect();
        let sort = Sort::try_parse_from(args).expect("parse should succeed");
        assert_eq!(sort.temp_codec, expected);
    }
    // Builds a `Sort` with the given order and fixed settings; tests override
    // individual fields with struct-update syntax or direct assignment.
    fn make_sort(order: SortOrderArg) -> Sort {
        Sort {
            input: PathBuf::from("test.bam"),
            output: None,
            verify: false,
            order,
            key_types: None,
            max_memory: MemoryLimit::Fixed(512 * 1024 * 1024),
            memory_reserve: MemoryReserve::Auto,
            memory_per_thread: true,
            tmp_dirs: Vec::new(),
            threads: 1,
            compression: CompressionOptions::default(),
            temp_compression: 1,
            temp_codec: fgumi_sort::SpillCodec::default(),
            write_index: false,
            async_reader: false,
        }
    }
    // Only template-coordinate order yields a cell tag (CB).
    #[rstest]
    #[case(SortOrderArg::TemplateCoordinate, Some(SamTag::CB))]
    #[case(SortOrderArg::Coordinate, None)]
    #[case(SortOrderArg::Queryname, None)]
    fn test_parse_cell_tag(#[case] order: SortOrderArg, #[case] expected: Option<SamTag>) {
        let sort = make_sort(order);
        assert_eq!(sort.parse_cell_tag().expect("parse_cell_tag should succeed"), expected);
    }
    // ---- parse_memory -----------------------------------------------------
    // "auto" is accepted in any letter case.
    #[test]
    fn test_parse_memory_auto() {
        assert_eq!(
            parse_memory("auto").expect("parse_memory should succeed for 'auto'"),
            MemoryLimit::Auto
        );
        assert_eq!(
            parse_memory("AUTO").expect("parse_memory should succeed for 'AUTO'"),
            MemoryLimit::Auto
        );
        assert_eq!(
            parse_memory("Auto").expect("parse_memory should succeed for 'Auto'"),
            MemoryLimit::Auto
        );
    }
    // A bare number is interpreted in units of 1024 * 1024 bytes.
    #[test]
    fn test_parse_memory_plain_numbers_as_mb() {
        assert_eq!(
            parse_memory("768").expect("parse_memory should succeed for 768"),
            MemoryLimit::Fixed(768 * 1024 * 1024)
        );
        assert_eq!(
            parse_memory("1").expect("parse_memory should succeed for 1"),
            MemoryLimit::Fixed(1024 * 1024)
        );
    }
    // Decimal suffixes (MB, G, GB) use powers of 1000; binary suffixes (MiB,
    // GiB) use powers of 1024.
    #[test]
    fn test_parse_memory_human_readable() {
        assert_eq!(
            parse_memory("512MB").expect("parse_memory should succeed for 512MB"),
            MemoryLimit::Fixed(512 * 1000 * 1000)
        );
        assert_eq!(
            parse_memory("1G").expect("parse_memory should succeed for 1G"),
            MemoryLimit::Fixed(1_000_000_000)
        );
        assert_eq!(
            parse_memory("2GB").expect("parse_memory should succeed for 2GB"),
            MemoryLimit::Fixed(2_000_000_000)
        );
        assert_eq!(
            parse_memory("1GiB").expect("parse_memory should succeed for 1GiB"),
            MemoryLimit::Fixed(1024 * 1024 * 1024)
        );
        assert_eq!(
            parse_memory("512MiB").expect("parse_memory should succeed for 512MiB"),
            MemoryLimit::Fixed(512 * 1024 * 1024)
        );
    }
    // Lower-case suffixes parse the same as upper-case ones.
    #[test]
    fn test_parse_memory_case_insensitive() {
        assert_eq!(
            parse_memory("512mb").expect("parse_memory should succeed for lowercase 512mb"),
            MemoryLimit::Fixed(512 * 1000 * 1000)
        );
        assert_eq!(
            parse_memory("1gb").expect("parse_memory should succeed for lowercase 1gb"),
            MemoryLimit::Fixed(1_000_000_000)
        );
    }
    // Fractional values are accepted when a suffix is present.
    #[test]
    fn test_parse_memory_decimal_with_suffix() {
        assert_eq!(
            parse_memory("1.5GB").expect("parse_memory should succeed for 1.5GB"),
            MemoryLimit::Fixed(1_500_000_000)
        );
    }
    // Empty, non-numeric and negative inputs are rejected.
    #[test]
    fn test_parse_memory_invalid() {
        assert!(parse_memory("").is_err());
        assert!(parse_memory("abc").is_err());
        assert!(parse_memory("-1G").is_err());
    }
    // ---- resolve_memory_budget / resolve_reserve --------------------------
    // A fixed limit with per-thread scaling is multiplied by the thread count.
    #[test]
    fn test_resolve_memory_limit_fixed() {
        let fixed = MemoryLimit::Fixed(1024 * 1024 * 1024); let resolved =
            resolve_memory_budget(fixed, MemoryReserve::Auto, 4, true).expect("should succeed");
        assert_eq!(resolved, 4 * 1024 * 1024 * 1024);
    }
    // Without per-thread scaling the fixed limit is used as the total.
    #[test]
    fn test_resolve_memory_limit_fixed_no_per_thread() {
        let fixed = MemoryLimit::Fixed(4 * 1024 * 1024 * 1024); let resolved =
            resolve_memory_budget(fixed, MemoryReserve::Auto, 4, false).expect("should succeed");
        assert_eq!(resolved, 4 * 1024 * 1024 * 1024);
    }
    // The auto budget is machine-dependent, so only bounds are asserted: at
    // least MIN_MEMORY_PER_THREAD per thread (capped at total memory), and at
    // most total memory when the machine has that much.
    #[test]
    fn test_resolve_memory_limit_auto() {
        let total = detect_total_memory();
        let resolved = resolve_memory_budget(MemoryLimit::Auto, MemoryReserve::Auto, 4, true)
            .expect("should succeed");
        let min_expected = MIN_MEMORY_PER_THREAD.saturating_mul(4).min(total);
        assert!(
            resolved >= min_expected,
            "auto resolved to {resolved} bytes, expected at least {min_expected}"
        );
        if total >= MIN_MEMORY_PER_THREAD.saturating_mul(4) {
            assert!(resolved <= total);
        }
    }
    // Auto mode without per-thread scaling still resolves to at least 256 MiB.
    #[test]
    fn test_resolve_memory_limit_auto_no_per_thread() {
        let resolved = resolve_memory_budget(MemoryLimit::Auto, MemoryReserve::Auto, 8, false)
            .expect("should succeed");
        assert!(resolved >= 256 * 1024 * 1024);
    }
    // Expected auto reserves for a range of total-memory sizes (second argument
    // is the total).
    #[test]
    fn test_resolve_reserve_auto() {
        let gib = 1024 * 1024 * 1024;
        assert_eq!(resolve_reserve(MemoryReserve::Auto, 32 * gib), 10 * gib);
        assert_eq!(resolve_reserve(MemoryReserve::Auto, 16 * gib), 8 * gib);
        assert_eq!(resolve_reserve(MemoryReserve::Auto, 8 * gib), 4 * gib);
        assert_eq!(resolve_reserve(MemoryReserve::Auto, 128 * gib), 10 * gib);
    }
    // A fixed reserve is returned as given.
    #[test]
    fn test_resolve_reserve_fixed() {
        let gib = 1024 * 1024 * 1024;
        assert_eq!(resolve_reserve(MemoryReserve::Fixed(12 * gib), 64 * gib), 12 * gib);
    }
    // `parse_memory_reserve` accepts "auto" and sized values.
    #[test]
    fn test_parse_memory_reserve() {
        assert_eq!(parse_memory_reserve("auto").expect("should parse 'auto'"), MemoryReserve::Auto,);
        assert_eq!(
            parse_memory_reserve("10GiB").expect("should parse '10GiB'"),
            MemoryReserve::Fixed(10 * 1024 * 1024 * 1024),
        );
        assert_eq!(
            parse_memory_reserve("8G").expect("should parse '8G'"),
            MemoryReserve::Fixed(8_000_000_000),
        );
    }
    // In auto mode, a larger fixed reserve must leave a smaller budget.
    #[test]
    fn test_resolve_memory_limit_auto_with_fixed_reserve() {
        let large_reserve = resolve_memory_budget(
            MemoryLimit::Auto,
            MemoryReserve::Fixed(512 * 1024 * 1024),
            4,
            true,
        )
        .expect("should succeed");
        let small_reserve = resolve_memory_budget(
            MemoryLimit::Auto,
            MemoryReserve::Fixed(128 * 1024 * 1024),
            4,
            true,
        )
        .expect("should succeed");
        assert!(large_reserve < small_reserve);
    }
    // ---- sort-order conversion and parsing --------------------------------
    // Each CLI order converts to the matching `SortOrder`.
    #[test]
    fn test_sort_order_conversion() {
        assert_eq!(SortOrder::from(SortOrderArg::Coordinate), SortOrder::Coordinate);
        assert_eq!(
            SortOrder::from(SortOrderArg::Queryname),
            SortOrder::Queryname(QuerynameComparator::Lexicographic)
        );
        assert_eq!(
            SortOrder::from(SortOrderArg::QuerynameNatural),
            SortOrder::Queryname(QuerynameComparator::Natural)
        );
        assert_eq!(
            SortOrder::from(SortOrderArg::TemplateCoordinate),
            SortOrder::TemplateCoordinate
        );
    }
    // Accepted spellings parse to the expected variant; rejected ones produce
    // an error containing the given fragment.
    #[rstest]
    #[case("coordinate", Ok(SortOrderArg::Coordinate))]
    #[case("queryname", Ok(SortOrderArg::Queryname))]
    #[case("queryname::lexicographic", Ok(SortOrderArg::Queryname))]
    #[case("queryname::lex", Ok(SortOrderArg::Queryname))]
    #[case("queryname::natural", Ok(SortOrderArg::QuerynameNatural))]
    #[case("template-coordinate", Ok(SortOrderArg::TemplateCoordinate))]
    #[case("queryname::fast", Err("unknown queryname sub-sort 'fast'"))]
    #[case("random", Err("unknown sort order 'random'"))]
    #[case("queryname::", Err("unknown queryname sub-sort ''"))]
    fn test_parse_sort_order(#[case] input: &str, #[case] expected: Result<SortOrderArg, &str>) {
        match expected {
            Ok(order) => assert_eq!(
                SortOrderArg::parse(input).expect("parse should succeed for valid sort order"),
                order
            ),
            Err(msg) => {
                let err = SortOrderArg::parse(input)
                    .expect_err("parse should fail for invalid sort order");
                assert!(err.contains(msg), "expected error containing {msg:?}, got: {err}");
            }
        }
    }
    // ---- header SO / SS tags ----------------------------------------------
    // Lexicographic queryname: SO "queryname", SS "lexicographic".
    #[test]
    fn test_queryname_lex_header_has_subsort() {
        let order = SortOrder::from(SortOrderArg::Queryname);
        assert_eq!(order.header_so_tag(), "queryname");
        assert_eq!(order.header_ss_tag(), Some("lexicographic"));
    }
    // Natural queryname: SO "queryname", SS "natural".
    #[test]
    fn test_queryname_natural_header_has_subsort() {
        let order = SortOrder::from(SortOrderArg::QuerynameNatural);
        assert_eq!(order.header_so_tag(), "queryname");
        assert_eq!(order.header_ss_tag(), Some("natural"));
    }
    // Coordinate: SO "coordinate", no SS tag.
    #[test]
    fn test_coordinate_header_no_subsort() {
        let order = SortOrder::from(SortOrderArg::Coordinate);
        assert_eq!(order.header_so_tag(), "coordinate");
        assert_eq!(order.header_ss_tag(), None);
    }
    // Template-coordinate: SO "unsorted", SS "template-coordinate".
    #[test]
    fn test_template_coordinate_header_subsort() {
        let order = SortOrder::from(SortOrderArg::TemplateCoordinate);
        assert_eq!(order.header_so_tag(), "unsorted");
        assert_eq!(order.header_ss_tag(), Some("template-coordinate"));
    }
    // ---- flag validation --------------------------------------------------
    // These checks fire before the input path is examined, so the placeholder
    // "test.bam" from `make_sort` never needs to exist.
    #[test]
    fn test_verify_conflicts_with_output() {
        let sort = Sort {
            verify: true,
            output: Some(PathBuf::from("out.bam")),
            ..make_sort(SortOrderArg::Coordinate)
        };
        let err = sort.execute("test").unwrap_err();
        assert!(err.to_string().contains("--verify cannot be used with --output"));
    }
    #[test]
    fn test_verify_conflicts_with_write_index() {
        let sort = Sort { verify: true, write_index: true, ..make_sort(SortOrderArg::Coordinate) };
        let err = sort.execute("test").unwrap_err();
        assert!(err.to_string().contains("--write-index cannot be used with --verify"));
    }
    // Level-0 temp compression is rejected with the zstd codec; calls
    // `execute_sort` directly to skip the input-existence check in `execute`.
    #[test]
    fn test_temp_compression_zero_with_zstd_rejected() {
        let sort = Sort {
            output: Some(PathBuf::from("out.bam")),
            temp_compression: 0,
            temp_codec: fgumi_sort::SpillCodec::Zstd,
            ..make_sort(SortOrderArg::Coordinate)
        };
        let err = sort.execute_sort("test").unwrap_err();
        let msg = err.to_string();
        assert!(
            msg.contains("--temp-compression 0 is only supported with --temp-codec bgzf"),
            "unexpected error: {msg}"
        );
    }
    // ---- end-to-end checks on real BAM files ------------------------------
    // Writes a BAM whose first pair is on contig 1 and second on contig 0, then
    // checks that coordinate verification reports at least one violation.
    #[test]
    fn test_verify_coordinate_fails_on_unsorted() -> Result<()> {
        use fgumi_sort::RawBamRecordReader;
        use fgumi_sort::extract_coordinate_key_inline;
        let mut builder = crate::sam::builder::SamBuilder::new();
        let _ = builder.add_pair().name("a").contig(1).start1(100).build();
        let _ = builder.add_pair().name("b").contig(0).start1(200).build();
        let dir = tempfile::tempdir()?;
        let bam_path = dir.path().join("unsorted.bam");
        builder.write_bam(&bam_path)?;
        let file = std::fs::File::open(&bam_path)?;
        let (_, header) = fgumi_bam_io::create_bam_reader(&bam_path, 1)?;
        let mut reader = RawBamRecordReader::new(file)?;
        reader.skip_header()?;
        let nref = header.reference_sequences().len() as u32;
        let (total, violations, _) = verify_sort_order(
            reader,
            |bam| extract_coordinate_key_inline(bam, nref),
            |key, prev| key < prev,
        )?;
        assert!(total > 0);
        assert!(violations > 0, "unsorted file should fail coordinate verify");
        Ok(())
    }
    // Three pairs share contig and start; two carry CB=A with a CB=B pair
    // written between them. After a template-coordinate sort, the four CB=A
    // reads must occupy four consecutive output positions.
    #[test]
    fn test_template_coordinate_sorts_by_cell_barcode() -> Result<()> {
        use crate::commands::command::Command;
        use crate::sam::builder::SamBuilder;
        use bstr::ByteSlice;
        let dir = tempfile::tempdir()?;
        let input = dir.path().join("input.bam");
        let output = dir.path().join("output.bam");
        let mut builder = SamBuilder::new();
        let _ = builder.add_pair().name("pair_a1").contig(0).start1(100).attr("CB", "A").build();
        let _ = builder.add_pair().name("pair_b").contig(0).start1(100).attr("CB", "B").build();
        let _ = builder.add_pair().name("pair_a2").contig(0).start1(100).attr("CB", "A").build();
        builder.write_bam(&input)?;
        let mut sort = make_sort(SortOrderArg::TemplateCoordinate);
        sort.input = input;
        sort.output = Some(output.clone());
        sort.execute("test")?;
        let mut reader = noodles::bam::io::reader::Builder.build_from_path(&output)?;
        let header = reader.read_header()?;
        let records: Vec<_> = reader.record_bufs(&header).collect::<std::io::Result<Vec<_>>>()?;
        assert_eq!(records.len(), 6, "should have 6 records (3 pairs × 2 reads)");
        let names: Vec<String> = records
            .iter()
            .map(|r| {
                r.name()
                    .map(|n| String::from_utf8_lossy(n.as_bytes()).into_owned())
                    .unwrap_or_default()
            })
            .collect();
        let a_positions: Vec<usize> = names
            .iter()
            .enumerate()
            .filter(|(_, n)| n.starts_with("pair_a"))
            .map(|(i, _)| i)
            .collect();
        assert_eq!(a_positions.len(), 4, "expected 4 reads for the two CB=A pairs");
        // Positions are collected in ascending order, so first/last are min/max.
        let min = a_positions[0];
        let max = *a_positions.last().unwrap();
        assert_eq!(
            max - min,
            3,
            "CB=A reads must be grouped together; got positions {a_positions:?}"
        );
        Ok(())
    }
    // ---- parse_key_types --------------------------------------------------
    // Keywords, single tokens, comma- and space-separated lists, and mixed case.
    #[rstest]
    #[case("full", KeyTypesSpec::Full)]
    #[case("none", KeyTypesSpec::None)]
    #[case("", KeyTypesSpec::None)]
    #[case("cb", KeyTypesSpec::Explicit { cb: true, tertiary: false })]
    #[case("library", KeyTypesSpec::Explicit { cb: false, tertiary: true })]
    #[case("mi", KeyTypesSpec::Explicit { cb: false, tertiary: true })]
    #[case("library,mi", KeyTypesSpec::Explicit { cb: false, tertiary: true })]
    #[case("cb,mi", KeyTypesSpec::Explicit { cb: true, tertiary: true })]
    #[case("cb library", KeyTypesSpec::Explicit { cb: true, tertiary: true })]
    #[case("FULL", KeyTypesSpec::Full)]
    #[case("None", KeyTypesSpec::None)]
    #[case("CB", KeyTypesSpec::Explicit { cb: true, tertiary: false })]
    #[case("Cb,MI", KeyTypesSpec::Explicit { cb: true, tertiary: true })]
    fn test_parse_key_types_ok(#[case] input: &str, #[case] expected: KeyTypesSpec) {
        assert_eq!(parse_key_types(input).expect("valid"), expected);
    }
    // Any unknown token makes the whole value invalid.
    #[rstest]
    #[case("bogus")]
    #[case("cb,bogus")]
    fn test_parse_key_types_err(#[case] input: &str) {
        assert!(parse_key_types(input).is_err());
    }
    // Omitting `--key-types` leaves the field as `None`.
    #[test]
    fn test_key_types_clap_default_is_none_option() {
        let sort = Sort::try_parse_from(["sort", "-i", "in.bam", "-o", "out.bam"]).expect("parse");
        assert!(sort.key_types.is_none());
    }
}