use noodles::sam::Header;
use std::cmp::Ordering;
use crate::sort::bam_fields;
use fgumi_raw_bam::RawRecordView;
use std::io::{Read, Write};
pub trait RawSortKey: Ord + Clone + Send + Sync + Sized {
const SERIALIZED_SIZE: Option<usize>;
const EMBEDDED_IN_RECORD: bool = false;
fn extract(bam: &[u8], ctx: &SortContext) -> Self;
#[must_use]
fn extract_from_record(_bam: &[u8]) -> Self {
unimplemented!("extract_from_record only valid when EMBEDDED_IN_RECORD is true")
}
fn write_to<W: Write>(&self, writer: &mut W) -> std::io::Result<()>;
fn read_from<R: Read>(reader: &mut R) -> std::io::Result<Self>;
}
#[derive(Clone)]
pub struct SortContext {
pub nref: u32,
}
impl SortContext {
#[must_use]
#[allow(clippy::cast_possible_truncation)]
pub fn from_header(header: &Header) -> Self {
Self { nref: header.reference_sequences().len() as u32 }
}
#[must_use]
pub fn new(nref: u32) -> Self {
Self { nref }
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum QuerynameComparator {
#[default]
Lexicographic,
Natural,
}
impl std::fmt::Display for QuerynameComparator {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::Lexicographic => write!(f, "lexicographic"),
Self::Natural => write!(f, "natural"),
}
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SortOrder {
Coordinate,
Queryname(QuerynameComparator),
TemplateCoordinate,
}
impl SortOrder {
#[must_use]
pub fn is_queryname(&self) -> bool {
matches!(self, Self::Queryname(_))
}
#[must_use]
pub fn queryname_comparator(&self) -> Option<QuerynameComparator> {
match self {
Self::Queryname(cmp) => Some(*cmp),
_ => None,
}
}
#[must_use]
pub fn header_so_tag(&self) -> &'static str {
match self {
Self::Coordinate => "coordinate",
Self::Queryname(_) => "queryname",
Self::TemplateCoordinate => "unsorted",
}
}
#[must_use]
pub fn header_go_tag(&self) -> Option<&'static str> {
match self {
Self::Coordinate | Self::Queryname(_) => None,
Self::TemplateCoordinate => Some("query"),
}
}
#[must_use]
pub fn header_ss_tag(&self) -> Option<&'static str> {
match self {
Self::Coordinate => None,
Self::Queryname(QuerynameComparator::Lexicographic) => Some("lexicographic"),
Self::Queryname(QuerynameComparator::Natural) => Some("natural"),
Self::TemplateCoordinate => Some("template-coordinate"),
}
}
}
#[repr(C)]
#[derive(Copy, Clone, Eq, PartialEq, Debug, bytemuck::Pod, bytemuck::Zeroable)]
pub struct RawCoordinateKey {
pub sort_key: u64,
}
impl RawCoordinateKey {
pub const SIZE: usize = 8;
#[inline]
#[must_use]
#[allow(clippy::cast_sign_loss)]
pub fn new(tid: i32, pos: i32, reverse: bool, nref: u32) -> Self {
let tid = if tid < 0 { nref } else { tid as u32 };
let key = (u64::from(tid) << 34)
| ((i64::from(pos) as u64).wrapping_add(1) << 1)
| u64::from(reverse);
Self { sort_key: key }
}
#[inline]
#[must_use]
pub fn unmapped() -> Self {
Self { sort_key: u64::MAX }
}
#[inline]
#[must_use]
pub fn zeroed() -> Self {
Self { sort_key: 0 }
}
}
impl Default for RawCoordinateKey {
fn default() -> Self {
Self::zeroed()
}
}
impl Ord for RawCoordinateKey {
#[inline]
fn cmp(&self, other: &Self) -> Ordering {
self.sort_key.cmp(&other.sort_key)
}
}
impl PartialOrd for RawCoordinateKey {
#[inline]
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl RawSortKey for RawCoordinateKey {
const SERIALIZED_SIZE: Option<usize> = Some(Self::SIZE);
const EMBEDDED_IN_RECORD: bool = true;
#[inline]
fn extract(bam: &[u8], ctx: &SortContext) -> Self {
let tid = bam_fields::ref_id(bam);
let pos = bam_fields::pos(bam);
let reverse = RawRecordView::new(bam).flags() & bam_fields::flags::REVERSE != 0;
if tid < 0 { Self::unmapped() } else { Self::new(tid, pos, reverse, ctx.nref) }
}
#[inline]
fn extract_from_record(bam: &[u8]) -> Self {
let tid = bam_fields::ref_id(bam);
if tid < 0 {
return Self::unmapped();
}
let pos = bam_fields::pos(bam);
let reverse = RawRecordView::new(bam).flags() & bam_fields::flags::REVERSE != 0;
#[allow(clippy::cast_sign_loss)]
Self::new(tid, pos, reverse, (tid as u32) + 1)
}
#[inline]
fn write_to<W: Write>(&self, writer: &mut W) -> std::io::Result<()> {
writer.write_all(&self.sort_key.to_le_bytes())
}
#[inline]
fn read_from<R: Read>(reader: &mut R) -> std::io::Result<Self> {
let mut buf = [0u8; 8];
reader.read_exact(&mut buf)?;
Ok(Self { sort_key: u64::from_le_bytes(buf) })
}
}
#[inline]
#[must_use]
pub const fn queryname_flag_order(flags: u16) -> u16 {
((flags & 0xc0) << 8) | ((flags & 0x100) << 3) | ((flags & 0x800) >> 3)
}
pub use fgumi_raw_bam::sort::{natural_compare, natural_compare_nul};
pub fn normalize_natural_key(name: &[u8], out: &mut Vec<u8>) {
let mut i = 0;
while i < name.len() {
if name[i].is_ascii_digit() {
while i < name.len() && name[i] == b'0' {
i += 1;
}
let sig_start = i;
while i < name.len() && name[i].is_ascii_digit() {
i += 1;
}
let sig_len = i - sig_start;
if sig_len == 0 {
out.push(1);
out.push(b'0');
} else {
#[allow(clippy::cast_possible_truncation)]
let len_byte = sig_len.min(255) as u8;
out.push(len_byte);
out.extend_from_slice(&name[sig_start..sig_start + sig_len.min(255)]);
}
} else {
out.push(name[i]);
i += 1;
}
}
}
#[inline]
fn extract_raw_name_and_flags(bam: &[u8]) -> (&[u8], u16) {
let name_len = (bam[8] as usize).saturating_sub(1);
let name =
if name_len > 0 && 32 + name_len <= bam.len() { &bam[32..32 + name_len] } else { &[] };
let raw_flags = u16::from_le_bytes([bam[14], bam[15]]);
(name, queryname_flag_order(raw_flags))
}
#[inline]
fn write_queryname_key<W: Write>(name: &[u8], flags: u16, writer: &mut W) -> std::io::Result<()> {
let name_len = u16::try_from(name.len()).map_err(|_| {
std::io::Error::new(std::io::ErrorKind::InvalidInput, "queryname too long for u16")
})?;
writer.write_all(&name_len.to_le_bytes())?;
writer.write_all(name)?;
writer.write_all(&flags.to_le_bytes())
}
#[inline]
fn read_queryname_key<R: Read>(reader: &mut R) -> std::io::Result<(Vec<u8>, u16)> {
let mut len_buf = [0u8; 2];
reader.read_exact(&mut len_buf)?;
let name_len = u16::from_le_bytes(len_buf) as usize;
let mut name = vec![0u8; name_len];
reader.read_exact(&mut name)?;
let mut flags_buf = [0u8; 2];
reader.read_exact(&mut flags_buf)?;
Ok((name, u16::from_le_bytes(flags_buf)))
}
#[derive(Clone, Debug, Default)]
pub struct RawQuerynameKey {
name: Vec<u8>,
flags: u16,
}
impl PartialEq for RawQuerynameKey {
fn eq(&self, other: &Self) -> bool {
self.cmp(other) == Ordering::Equal
}
}
impl Eq for RawQuerynameKey {}
impl RawQuerynameKey {
#[must_use]
pub fn new(mut name: Vec<u8>, flags: u16) -> Self {
if name.last() != Some(&0) {
name.push(0);
}
Self { name, flags }
}
#[must_use]
pub fn name(&self) -> &[u8] {
&self.name
}
#[inline]
#[must_use]
fn extract_queryname_key(bam: &[u8]) -> Self {
let (raw_name, flags) = extract_raw_name_and_flags(bam);
let mut name = Vec::with_capacity(raw_name.len() + 1);
name.extend_from_slice(raw_name);
name.push(0);
Self { name, flags }
}
}
impl Ord for RawQuerynameKey {
#[inline]
#[allow(unsafe_code)]
fn cmp(&self, other: &Self) -> Ordering {
unsafe { natural_compare_nul(self.name.as_ptr(), other.name.as_ptr()) }
.then_with(|| self.flags.cmp(&other.flags))
}
}
impl PartialOrd for RawQuerynameKey {
#[inline]
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl RawSortKey for RawQuerynameKey {
const SERIALIZED_SIZE: Option<usize> = None; const EMBEDDED_IN_RECORD: bool = true;
#[inline]
fn extract(bam: &[u8], _ctx: &SortContext) -> Self {
Self::extract_queryname_key(bam)
}
#[inline]
fn extract_from_record(bam: &[u8]) -> Self {
Self::extract_queryname_key(bam)
}
#[inline]
fn write_to<W: Write>(&self, writer: &mut W) -> std::io::Result<()> {
write_queryname_key(&self.name, self.flags, writer)
}
#[inline]
fn read_from<R: Read>(reader: &mut R) -> std::io::Result<Self> {
let (name, flags) = read_queryname_key(reader)?;
Ok(Self::new(name, flags))
}
}
#[derive(Clone, Eq, PartialEq, Debug, Default)]
pub struct RawQuerynameLexKey {
name: Vec<u8>,
flags: u16,
}
impl RawQuerynameLexKey {
#[must_use]
pub fn new(name: Vec<u8>, flags: u16) -> Self {
Self { name, flags }
}
#[must_use]
pub fn name(&self) -> &[u8] {
&self.name
}
#[inline]
#[must_use]
fn extract_queryname_key(bam: &[u8]) -> Self {
let (raw_name, flags) = extract_raw_name_and_flags(bam);
Self { name: raw_name.to_vec(), flags }
}
}
impl Ord for RawQuerynameLexKey {
#[inline]
fn cmp(&self, other: &Self) -> Ordering {
self.name.cmp(&other.name).then_with(|| self.flags.cmp(&other.flags))
}
}
impl PartialOrd for RawQuerynameLexKey {
#[inline]
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl RawSortKey for RawQuerynameLexKey {
const SERIALIZED_SIZE: Option<usize> = None;
const EMBEDDED_IN_RECORD: bool = true;
#[inline]
fn extract(bam: &[u8], _ctx: &SortContext) -> Self {
Self::extract_queryname_key(bam)
}
#[inline]
fn extract_from_record(bam: &[u8]) -> Self {
Self::extract_queryname_key(bam)
}
#[inline]
fn write_to<W: Write>(&self, writer: &mut W) -> std::io::Result<()> {
write_queryname_key(&self.name, self.flags, writer)
}
#[inline]
fn read_from<R: Read>(reader: &mut R) -> std::io::Result<Self> {
let (name, flags) = read_queryname_key(reader)?;
Ok(Self { name, flags })
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_queryname_flag_order_exact_transformation_values() {
assert_eq!(queryname_flag_order(0x0000), 0x0000);
assert_eq!(queryname_flag_order(0x0040), 0x4000);
assert_eq!(queryname_flag_order(0x0080), 0x8000);
assert_eq!(queryname_flag_order(0x0100), 0x0800);
assert_eq!(queryname_flag_order(0x0800), 0x0100);
assert_eq!(queryname_flag_order(0x0840), 0x4100);
assert_eq!(queryname_flag_order(0x0140), 0x4800);
assert_eq!(queryname_flag_order(0x0880), 0x8100);
assert_eq!(queryname_flag_order(0x0180), 0x8800);
}
#[test]
fn test_queryname_flag_order_complete_sort_order() {
let flags_in_order = [
0x0000u16, 0x0800, 0x0100, 0x0040, 0x0840, 0x0140, 0x0080, 0x0880, 0x0180, 0x00c0, 0x08c0, 0x01c0, ];
let transformed: Vec<u16> =
flags_in_order.iter().map(|&f| queryname_flag_order(f)).collect();
for i in 0..transformed.len() - 1 {
assert!(
transformed[i] < transformed[i + 1],
"Expected flags 0x{:04x} (transformed 0x{:04x}) < 0x{:04x} (transformed 0x{:04x})",
flags_in_order[i],
transformed[i],
flags_in_order[i + 1],
transformed[i + 1]
);
}
}
#[test]
fn test_queryname_flag_order_r1_before_r2() {
let r1_primary = queryname_flag_order(0x0040);
let r1_supp = queryname_flag_order(0x0840);
let r1_sec = queryname_flag_order(0x0140);
let r2_primary = queryname_flag_order(0x0080);
let r2_supp = queryname_flag_order(0x0880);
let r2_sec = queryname_flag_order(0x0180);
assert!(r1_primary < r2_primary);
assert!(r1_primary < r2_supp);
assert!(r1_primary < r2_sec);
assert!(r1_supp < r2_primary);
assert!(r1_supp < r2_supp);
assert!(r1_supp < r2_sec);
assert!(r1_sec < r2_primary);
assert!(r1_sec < r2_supp);
assert!(r1_sec < r2_sec);
}
#[test]
fn test_queryname_flag_order_primary_before_supplementary_before_secondary() {
let none_pri = queryname_flag_order(0x0000);
let none_supp = queryname_flag_order(0x0800);
let none_sec = queryname_flag_order(0x0100);
assert!(none_pri < none_supp, "NONE: PRIMARY < SUPP");
assert!(none_supp < none_sec, "NONE: SUPP < SEC");
let r1_pri = queryname_flag_order(0x0040);
let r1_supp = queryname_flag_order(0x0840);
let r1_sec = queryname_flag_order(0x0140);
assert!(r1_pri < r1_supp, "R1: PRIMARY < SUPP");
assert!(r1_supp < r1_sec, "R1: SUPP < SEC");
let r2_pri = queryname_flag_order(0x0080);
let r2_supp = queryname_flag_order(0x0880);
let r2_sec = queryname_flag_order(0x0180);
assert!(r2_pri < r2_supp, "R2: PRIMARY < SUPP");
assert!(r2_supp < r2_sec, "R2: SUPP < SEC");
}
#[test]
fn test_queryname_flag_order_none_before_r1_before_r2() {
let none = queryname_flag_order(0x0000);
let r1 = queryname_flag_order(0x0040);
let r2 = queryname_flag_order(0x0080);
assert!(none < r1, "NONE < R1");
assert!(r1 < r2, "R1 < R2");
}
#[test]
fn test_queryname_flag_order_irrelevant_flags_do_not_affect_order() {
let irrelevant_flags: u16 = 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x200 | 0x400;
let r1_base = queryname_flag_order(0x0040);
let r1_with_irrelevant = queryname_flag_order(0x0040 | irrelevant_flags);
assert_eq!(r1_base, r1_with_irrelevant, "Irrelevant flags should not change result");
let r2_supp_base = queryname_flag_order(0x0880);
let r2_supp_with = queryname_flag_order(0x0880 | irrelevant_flags);
assert_eq!(r2_supp_base, r2_supp_with);
let none_sec_base = queryname_flag_order(0x0100);
let none_sec_with = queryname_flag_order(0x0100 | irrelevant_flags);
assert_eq!(none_sec_base, none_sec_with);
}
#[test]
fn test_queryname_flag_order_real_world_flags() {
let r1_fwd = queryname_flag_order(0x0063);
assert_eq!(r1_fwd, 0x4000, "R1 fwd should extract to R1 PRIMARY");
let r2_rev = queryname_flag_order(0x0093);
assert_eq!(r2_rev, 0x8000, "R2 rev should extract to R2 PRIMARY");
let r1_supp = queryname_flag_order(0x0841);
assert_eq!(r1_supp, 0x4100, "R1 supp should extract to R1 SUPP");
let r2_sec = queryname_flag_order(0x0181);
assert_eq!(r2_sec, 0x8800, "R2 sec should extract to R2 SEC");
assert!(r1_fwd < r1_supp);
assert!(r1_supp < r2_rev);
assert!(r2_rev < r2_sec);
}
#[test]
fn test_queryname_flag_order_from_test_data() {
let f113 = queryname_flag_order(113); let f177 = queryname_flag_order(177); let f2161 = queryname_flag_order(2161);
assert!(f113 < f2161, "R1 PRIMARY (113) should be < R1 SUPP (2161): {f113} vs {f2161}");
assert!(f2161 < f177, "R1 SUPP (2161) should be < R2 PRIMARY (177): {f2161} vs {f177}");
}
#[test]
fn test_queryname_flag_order_edge_case_both_r1_r2() {
let both = queryname_flag_order(0x00c0);
assert_eq!(both, 0xc000);
let r1_only = queryname_flag_order(0x0040);
let r2_only = queryname_flag_order(0x0080);
assert!(r1_only < both);
assert!(r2_only < both);
}
#[test]
fn test_queryname_flag_order_is_const() {
const R1_PRIMARY: u16 = queryname_flag_order(0x0040);
const R2_PRIMARY: u16 = queryname_flag_order(0x0080);
assert_eq!(R1_PRIMARY, 0x4000);
assert_eq!(R2_PRIMARY, 0x8000);
}
use rstest::rstest;
#[rstest]
#[case(b"abc", b"abd", Ordering::Less)]
#[case(b"abd", b"abc", Ordering::Greater)]
#[case(b"abc", b"abc", Ordering::Equal)]
#[case(b"ABC", b"abc", Ordering::Less)] #[case(b"z", b"a", Ordering::Greater)]
#[case(b"aaa", b"aab", Ordering::Less)]
#[case(b"ZZZ", b"aaa", Ordering::Less)] fn test_natural_compare_alpha_only(
#[case] a: &[u8],
#[case] b: &[u8],
#[case] expected: Ordering,
) {
assert_eq!(natural_compare(a, b), expected);
}
#[rstest]
#[case(b"", b"", Ordering::Equal)]
#[case(b"", b"x", Ordering::Less)]
#[case(b"x", b"", Ordering::Greater)]
#[case(b"", b"0", Ordering::Less)]
#[case(b"0", b"", Ordering::Greater)]
#[case(b"a", b"a", Ordering::Equal)]
#[case(b"0", b"0", Ordering::Equal)]
fn test_natural_compare_empty_and_single(
#[case] a: &[u8],
#[case] b: &[u8],
#[case] expected: Ordering,
) {
assert_eq!(natural_compare(a, b), expected);
}
#[rstest]
#[case(b"abc", b"abcd", Ordering::Less)]
#[case(b"abcd", b"abc", Ordering::Greater)]
#[case(b"read", b"read1", Ordering::Less)]
#[case(b"read1", b"read", Ordering::Greater)]
#[case(b"a1", b"a1b", Ordering::Less)]
#[case(b"a1b", b"a1", Ordering::Greater)]
fn test_natural_compare_prefix_relationships(
#[case] a: &[u8],
#[case] b: &[u8],
#[case] expected: Ordering,
) {
assert_eq!(natural_compare(a, b), expected);
}
#[rstest]
#[case(b"0", b"1", Ordering::Less)]
#[case(b"1", b"2", Ordering::Less)]
#[case(b"9", b"0", Ordering::Greater)]
#[case(b"5", b"5", Ordering::Equal)]
fn test_natural_compare_single_digit(
#[case] a: &[u8],
#[case] b: &[u8],
#[case] expected: Ordering,
) {
assert_eq!(natural_compare(a, b), expected);
}
#[rstest]
#[case(b"2", b"10", Ordering::Less)] #[case(b"10", b"2", Ordering::Greater)]
#[case(b"9", b"10", Ordering::Less)]
#[case(b"10", b"9", Ordering::Greater)]
#[case(b"10", b"10", Ordering::Equal)]
#[case(b"99", b"100", Ordering::Less)]
#[case(b"100", b"99", Ordering::Greater)]
#[case(b"123", b"456", Ordering::Less)]
#[case(b"456", b"123", Ordering::Greater)]
#[case(b"999", b"1000", Ordering::Less)]
#[case(b"1000", b"999", Ordering::Greater)]
fn test_natural_compare_multidigit(
#[case] a: &[u8],
#[case] b: &[u8],
#[case] expected: Ordering,
) {
assert_eq!(natural_compare(a, b), expected);
}
#[rstest]
#[case(b"01", b"1", Ordering::Greater)]
#[case(b"1", b"01", Ordering::Less)]
#[case(b"001", b"1", Ordering::Greater)]
#[case(b"001", b"01", Ordering::Greater)]
#[case(b"010", b"10", Ordering::Greater)]
#[case(b"0010", b"10", Ordering::Greater)]
#[case(b"00", b"0", Ordering::Greater)]
#[case(b"000", b"0", Ordering::Greater)]
#[case(b"02", b"1", Ordering::Greater)] #[case(b"02", b"10", Ordering::Less)] #[case(b"009", b"10", Ordering::Less)] #[case(b"0100", b"99", Ordering::Greater)] fn test_natural_compare_leading_zeros(
#[case] a: &[u8],
#[case] b: &[u8],
#[case] expected: Ordering,
) {
assert_eq!(natural_compare(a, b), expected);
}
#[rstest]
#[case(b"0", b"a", Ordering::Less)] #[case(b"9", b"a", Ordering::Less)]
#[case(b"a", b"0", Ordering::Greater)]
#[case(b"a", b"9", Ordering::Greater)]
#[case(b"1", b"A", Ordering::Less)]
#[case(b"1", b"Z", Ordering::Less)]
#[case(b"1", b"z", Ordering::Less)]
fn test_natural_compare_digits_before_nondigits(
#[case] a: &[u8],
#[case] b: &[u8],
#[case] expected: Ordering,
) {
assert_eq!(natural_compare(a, b), expected);
}
#[rstest]
#[case(b"a1b2", b"a1b10", Ordering::Less)]
#[case(b"a1b10", b"a1b2", Ordering::Greater)]
#[case(b"x10y20", b"x10y3", Ordering::Greater)]
#[case(b"x10y3", b"x10y20", Ordering::Less)]
#[case(b"abc123def", b"abc123deg", Ordering::Less)]
#[case(b"abc123def", b"abc124def", Ordering::Less)]
#[case(b"abc124def", b"abc123def", Ordering::Greater)]
#[case(b"file1part2", b"file1part10", Ordering::Less)]
#[case(b"file2part1", b"file10part1", Ordering::Less)]
#[case(b"v1.2.3", b"v1.2.10", Ordering::Less)] #[case(b"v1.2.10", b"v1.10.2", Ordering::Less)]
fn test_natural_compare_mixed_alphanumeric(
#[case] a: &[u8],
#[case] b: &[u8],
#[case] expected: Ordering,
) {
assert_eq!(natural_compare(a, b), expected);
}
#[test]
fn test_natural_compare_samtools_name_sort_order() {
let names: Vec<&[u8]> = vec![
b"r000", b"r001", b"r002", b"r003", b"r004", b"u1", b"x1", b"x2", b"x3", b"x4", b"x5",
b"x6",
];
for i in 0..names.len() - 1 {
assert_eq!(
natural_compare(names[i], names[i + 1]),
Ordering::Less,
"{} should sort before {}",
std::str::from_utf8(names[i]).expect("should be valid UTF-8"),
std::str::from_utf8(names[i + 1]).expect("should be valid UTF-8"),
);
}
}
#[test]
fn test_natural_compare_samtools_name2_sort_order() {
let names: Vec<&[u8]> =
vec![b"r005", b"r006", b"r007", b"x7", b"x8", b"x9", b"x10", b"x11", b"x12"];
for i in 0..names.len() - 1 {
assert_eq!(
natural_compare(names[i], names[i + 1]),
Ordering::Less,
"{} should sort before {}",
std::str::from_utf8(names[i]).expect("should be valid UTF-8"),
std::str::from_utf8(names[i + 1]).expect("should be valid UTF-8"),
);
}
}
#[rstest]
#[case(
b"A00132:53:HFHJKDSXX:1:1101:12345:67890",
b"A00132:53:HFHJKDSXX:1:1102:12345:67890",
Ordering::Less
)]
#[case(
b"A00132:53:HFHJKDSXX:1:1101:12345:67890",
b"A00132:53:HFHJKDSXX:1:1101:12346:67890",
Ordering::Less
)]
#[case(
b"A00132:53:HFHJKDSXX:1:1101:12345:67890",
b"A00132:53:HFHJKDSXX:2:1101:12345:67890",
Ordering::Less
)]
#[case(
b"A00132:53:HFHJKDSXX:1:1101:12345:67890",
b"A00132:54:HFHJKDSXX:1:1101:12345:67890",
Ordering::Less
)]
#[case(
b"A00132:53:HFHJKDSXX:1:1101:12345:67890",
b"A00132:53:HFHJKDSXX:1:1101:12345:67890",
Ordering::Equal
)]
#[case(
b"A00132:53:HFHJKDSXX:1:1101:12345:67890",
b"B00132:53:HFHJKDSXX:1:1101:12345:67890",
Ordering::Less
)]
#[case(b"INST:1:FLOW:1:1101:1:1", b"INST:1:FLOW:1:2201:1:1", Ordering::Less)]
fn test_natural_compare_illumina_names(
#[case] a: &[u8],
#[case] b: &[u8],
#[case] expected: Ordering,
) {
assert_eq!(natural_compare(a, b), expected);
}
#[rstest]
#[case(b"SRR099966.1", b"SRR099966.2", Ordering::Less)]
#[case(b"SRR099966.9", b"SRR099966.10", Ordering::Less)]
#[case(b"SRR099966.99", b"SRR099966.100", Ordering::Less)]
#[case(b"SRR099966.12345", b"SRR099966.12346", Ordering::Less)]
#[case(b"SRR099966.12345", b"SRR099966.12345", Ordering::Equal)]
#[case(b"SRR099966.12345", b"SRR099967.1", Ordering::Less)] #[case(b"SRR1.1", b"SRR2.1", Ordering::Less)]
#[case(b"SRR9.1", b"SRR10.1", Ordering::Less)]
fn test_natural_compare_srr_names(
#[case] a: &[u8],
#[case] b: &[u8],
#[case] expected: Ordering,
) {
assert_eq!(natural_compare(a, b), expected);
}
#[rstest]
#[case(b"mol000001_read0001", b"mol000001_read0002", Ordering::Less)]
#[case(b"mol000001_read0009", b"mol000001_read0010", Ordering::Less)]
#[case(b"mol000001_read0001", b"mol000002_read0001", Ordering::Less)]
#[case(b"mol1_read1", b"mol000001_read0001", Ordering::Less)] #[case(b"mol1_read1", b"mol1_read2", Ordering::Less)]
#[case(b"mol2_read1", b"mol10_read1", Ordering::Less)]
fn test_natural_compare_synthetic_names(
#[case] a: &[u8],
#[case] b: &[u8],
#[case] expected: Ordering,
) {
assert_eq!(natural_compare(a, b), expected);
}
#[test]
fn test_natural_compare_long_numeric_runs() {
let big_a = b"x99999999999999999999999999999999999999"; let big_b = b"x100000000000000000000000000000000000000"; assert_eq!(natural_compare(big_a, big_b), Ordering::Less);
let long_num = b"prefix99999999999999999999999999suffix";
assert_eq!(natural_compare(long_num, long_num), Ordering::Equal);
let a = b"x99999999999999999999999999999999999998";
let b = b"x99999999999999999999999999999999999999";
assert_eq!(natural_compare(a, b), Ordering::Less);
}
#[test]
fn test_natural_compare_u64_overflow_boundary() {
let u64_max = b"read18446744073709551615";
let u64_max_plus_1 = b"read18446744073709551616";
assert_eq!(natural_compare(u64_max, u64_max_plus_1), Ordering::Less);
let huge_a = b"read99999999999999999999"; let huge_b = b"read100000000000000000000"; assert_eq!(natural_compare(huge_a, huge_b), Ordering::Less);
}
proptest::proptest! {
#[test]
fn test_natural_compare_symmetry(a: Vec<u8>, b: Vec<u8>) {
let ab = natural_compare(&a, &b);
let ba = natural_compare(&b, &a);
proptest::prop_assert_eq!(ab, ba.reverse());
}
#[test]
fn test_natural_compare_reflexive(s: Vec<u8>) {
proptest::prop_assert_eq!(natural_compare(&s, &s), Ordering::Equal);
}
#[test]
fn test_natural_compare_transitivity(
mut values in proptest::collection::vec(proptest::collection::vec(0..=255u8, 0..20), 3..10),
) {
values.sort_by(|a, b| natural_compare(a, b));
for i in 0..values.len() {
for j in (i + 1)..values.len() {
let cmp = natural_compare(&values[i], &values[j]);
proptest::prop_assert!(
cmp != Ordering::Greater,
"Transitivity violated: sorted[{}] ({:?}) > sorted[{}] ({:?})",
i, values[i], j, values[j],
);
}
}
}
}
#[test]
fn test_natural_compare_leading_zeros_tiebreak_by_length() {
assert_eq!(natural_compare(b"00", b"0"), Ordering::Greater);
assert_eq!(natural_compare(b"000", b"00"), Ordering::Greater);
assert_eq!(natural_compare(b"0", b"00"), Ordering::Less);
assert_eq!(natural_compare(b"01", b"1"), Ordering::Greater);
assert_eq!(natural_compare(b"00100", b"100"), Ordering::Greater);
}
#[test]
fn test_natural_compare_strnum_cmp_compat_numeric_vs_alpha_boundary() {
assert_eq!(natural_compare(b"a0", b"aa"), Ordering::Less);
assert_eq!(natural_compare(b"a9", b"aa"), Ordering::Less);
assert_eq!(natural_compare(b"aa", b"a0"), Ordering::Greater);
}
#[test]
fn test_natural_compare_strnum_cmp_compat_numeric_then_string() {
assert_eq!(natural_compare(b"a1b", b"a1c"), Ordering::Less);
assert_eq!(natural_compare(b"a10b", b"a10c"), Ordering::Less);
assert_eq!(natural_compare(b"a10b", b"a2c"), Ordering::Greater); }
#[test]
fn test_natural_compare_all_zeros_tiebreak_by_length() {
assert_eq!(natural_compare(b"0", b"00"), Ordering::Less);
assert_eq!(natural_compare(b"00", b"000"), Ordering::Less);
assert_eq!(natural_compare(b"0", b"000"), Ordering::Less);
assert_eq!(natural_compare(b"a0b", b"a00b"), Ordering::Less);
assert_eq!(natural_compare(b"a0b", b"a000b"), Ordering::Less);
}
#[rstest]
#[case(b"read1", b"read1a", Ordering::Less)] #[case(b"read1a", b"read1", Ordering::Greater)]
#[case(b"read10", b"read10a", Ordering::Less)]
#[case(b"read10a", b"read10", Ordering::Greater)]
fn test_natural_compare_numeric_then_suffix(
#[case] a: &[u8],
#[case] b: &[u8],
#[case] expected: Ordering,
) {
assert_eq!(natural_compare(a, b), expected);
}
#[test]
fn test_natural_compare_multiple_numeric_segments() {
assert_eq!(natural_compare(b"r1.1.1", b"r1.1.2"), Ordering::Less);
assert_eq!(natural_compare(b"r1.1.9", b"r1.1.10"), Ordering::Less);
assert_eq!(natural_compare(b"r1.9.1", b"r1.10.1"), Ordering::Less);
assert_eq!(natural_compare(b"r9.1.1", b"r10.1.1"), Ordering::Less);
}
#[test]
fn test_natural_compare_adjacent_numeric_runs() {
assert_eq!(natural_compare(b"a1b2", b"a1b10"), Ordering::Less);
assert_eq!(natural_compare(b"a12", b"a9"), Ordering::Greater);
}
#[test]
fn test_natural_compare_high_bytes() {
assert_eq!(natural_compare(&[0xFF], &[0xFE]), Ordering::Greater);
assert_eq!(natural_compare(&[0x80], &[0x7F]), Ordering::Greater);
assert_eq!(natural_compare(b"0", &[0x80]), Ordering::Less);
}
#[test]
fn test_natural_compare_sort_illumina_batch() {
let names: Vec<&[u8]> = vec![
b"INST:100:FLOW:2:2201:9999:1",
b"INST:1:FLOW:1:1101:1:1",
b"INST:1:FLOW:1:1101:1:2",
b"INST:1:FLOW:1:1101:2:1",
b"INST:1:FLOW:1:1102:1:1",
b"INST:1:FLOW:2:1101:1:1",
b"INST:2:FLOW:1:1101:1:1",
b"INST:10:FLOW:1:1101:1:1",
b"INST:100:FLOW:1:1101:1:1",
];
let mut sorted = names.clone();
sorted.sort_by(|a, b| natural_compare(a, b));
let expected: Vec<&[u8]> = vec![
b"INST:1:FLOW:1:1101:1:1",
b"INST:1:FLOW:1:1101:1:2",
b"INST:1:FLOW:1:1101:2:1",
b"INST:1:FLOW:1:1102:1:1",
b"INST:1:FLOW:2:1101:1:1",
b"INST:2:FLOW:1:1101:1:1",
b"INST:10:FLOW:1:1101:1:1",
b"INST:100:FLOW:1:1101:1:1",
b"INST:100:FLOW:2:2201:9999:1",
];
assert_eq!(sorted, expected);
}
#[test]
fn test_natural_compare_sort_mixed_name_formats() {
let mut names: Vec<&[u8]> = vec![
b"SRR099966.100",
b"SRR099966.9",
b"SRR099966.10",
b"SRR099966.1",
b"SRR099966.99",
b"SRR099966.2",
];
names.sort_by(|a, b| natural_compare(a, b));
let expected: Vec<&[u8]> = vec![
b"SRR099966.1",
b"SRR099966.2",
b"SRR099966.9",
b"SRR099966.10",
b"SRR099966.99",
b"SRR099966.100",
];
assert_eq!(names, expected);
}
#[test]
fn test_queryname_comparator_default_is_lexicographic() {
assert_eq!(QuerynameComparator::default(), QuerynameComparator::Lexicographic);
}
#[test]
fn test_queryname_comparator_display() {
assert_eq!(QuerynameComparator::Lexicographic.to_string(), "lexicographic");
assert_eq!(QuerynameComparator::Natural.to_string(), "natural");
}
#[test]
fn test_sort_order_is_queryname() {
assert!(!SortOrder::Coordinate.is_queryname());
assert!(SortOrder::Queryname(QuerynameComparator::Lexicographic).is_queryname());
assert!(SortOrder::Queryname(QuerynameComparator::Natural).is_queryname());
assert!(!SortOrder::TemplateCoordinate.is_queryname());
}
#[test]
fn test_sort_order_queryname_comparator() {
assert_eq!(SortOrder::Coordinate.queryname_comparator(), None);
assert_eq!(
SortOrder::Queryname(QuerynameComparator::Lexicographic).queryname_comparator(),
Some(QuerynameComparator::Lexicographic)
);
assert_eq!(
SortOrder::Queryname(QuerynameComparator::Natural).queryname_comparator(),
Some(QuerynameComparator::Natural)
);
assert_eq!(SortOrder::TemplateCoordinate.queryname_comparator(), None);
}
#[test]
fn test_sort_order_header_so_tag() {
assert_eq!(SortOrder::Coordinate.header_so_tag(), "coordinate");
assert_eq!(
SortOrder::Queryname(QuerynameComparator::Lexicographic).header_so_tag(),
"queryname"
);
assert_eq!(SortOrder::Queryname(QuerynameComparator::Natural).header_so_tag(), "queryname");
assert_eq!(SortOrder::TemplateCoordinate.header_so_tag(), "unsorted");
}
#[test]
fn test_sort_order_header_ss_tag() {
assert_eq!(SortOrder::Coordinate.header_ss_tag(), None);
assert_eq!(
SortOrder::Queryname(QuerynameComparator::Lexicographic).header_ss_tag(),
Some("lexicographic")
);
assert_eq!(
SortOrder::Queryname(QuerynameComparator::Natural).header_ss_tag(),
Some("natural")
);
assert_eq!(SortOrder::TemplateCoordinate.header_ss_tag(), Some("template-coordinate"));
}
#[test]
fn test_sort_order_header_go_tag() {
assert_eq!(SortOrder::Coordinate.header_go_tag(), None);
assert_eq!(SortOrder::Queryname(QuerynameComparator::Lexicographic).header_go_tag(), None);
assert_eq!(SortOrder::TemplateCoordinate.header_go_tag(), Some("query"));
}
#[test]
fn test_lex_key_lexicographic_ordering() {
let k1 = RawQuerynameLexKey::new(b"read10".to_vec(), 0);
let k2 = RawQuerynameLexKey::new(b"read2".to_vec(), 0);
assert!(k1 < k2, "lexicographic: 'read10' should be < 'read2'");
}
#[test]
fn test_natural_key_natural_ordering() {
let k1 = RawQuerynameKey::new(b"read2".to_vec(), 0);
let k2 = RawQuerynameKey::new(b"read10".to_vec(), 0);
assert!(k1 < k2, "natural: 'read2' should be < 'read10'");
}
#[test]
fn test_lex_vs_natural_ordering_difference() {
let lex_10 = RawQuerynameLexKey::new(b"read10".to_vec(), 0);
let lex_2 = RawQuerynameLexKey::new(b"read2".to_vec(), 0);
assert!(lex_10 < lex_2, "lexicographic: read10 < read2");
let nat_2 = RawQuerynameKey::new(b"read2".to_vec(), 0);
let nat_10 = RawQuerynameKey::new(b"read10".to_vec(), 0);
assert!(nat_2 < nat_10, "natural: read2 < read10");
}
#[test]
fn test_lex_key_flag_tiebreak() {
let k1 = RawQuerynameLexKey::new(b"readA".to_vec(), 0);
let k2 = RawQuerynameLexKey::new(b"readA".to_vec(), 1);
assert!(k1 < k2);
}
#[test]
fn test_lex_key_empty_names() {
let k1 = RawQuerynameLexKey::new(Vec::new(), 0);
let k2 = RawQuerynameLexKey::new(b"a".to_vec(), 0);
assert!(k1 < k2);
}
#[test]
fn test_lex_key_illumina_names() {
let mut names: Vec<RawQuerynameLexKey> = vec![
RawQuerynameLexKey::new(b"A00132:53:HFHJKDSXX:2:1100:5000:1000".to_vec(), 0),
RawQuerynameLexKey::new(b"A00132:53:HFHJKDSXX:1:1100:5000:1000".to_vec(), 0),
RawQuerynameLexKey::new(b"A00132:54:HFH2JDSXX:1:1100:5000:1000".to_vec(), 0),
];
names.sort();
assert_eq!(names[0].name(), b"A00132:53:HFHJKDSXX:1:1100:5000:1000");
assert_eq!(names[1].name(), b"A00132:53:HFHJKDSXX:2:1100:5000:1000");
assert_eq!(names[2].name(), b"A00132:54:HFH2JDSXX:1:1100:5000:1000");
}
#[test]
fn test_lex_key_srr_names_differ_from_natural() {
let mut lex_names: Vec<RawQuerynameLexKey> = vec![
RawQuerynameLexKey::new(b"SRR099966.100".to_vec(), 0),
RawQuerynameLexKey::new(b"SRR099966.2".to_vec(), 0),
RawQuerynameLexKey::new(b"SRR099966.10".to_vec(), 0),
];
lex_names.sort();
assert_eq!(lex_names[0].name(), b"SRR099966.10");
assert_eq!(lex_names[1].name(), b"SRR099966.100");
assert_eq!(lex_names[2].name(), b"SRR099966.2");
let mut nat_names: Vec<RawQuerynameKey> = vec![
RawQuerynameKey::new(b"SRR099966.100".to_vec(), 0),
RawQuerynameKey::new(b"SRR099966.2".to_vec(), 0),
RawQuerynameKey::new(b"SRR099966.10".to_vec(), 0),
];
nat_names.sort();
assert_eq!(nat_names[0].name(), b"SRR099966.2\0");
assert_eq!(nat_names[1].name(), b"SRR099966.10\0");
assert_eq!(nat_names[2].name(), b"SRR099966.100\0");
}
#[test]
fn test_lex_key_serialization_roundtrip() {
let key = RawQuerynameLexKey::new(b"test_read".to_vec(), 42);
let mut buf = Vec::new();
key.write_to(&mut buf).expect("write_to should succeed");
let mut cursor = std::io::Cursor::new(&buf);
let restored =
RawQuerynameLexKey::read_from(&mut cursor).expect("read_from should succeed");
assert_eq!(key, restored);
}
#[test]
fn test_lex_key_variable_length() {
const { assert!(RawQuerynameLexKey::SERIALIZED_SIZE.is_none()) };
}
#[test]
fn test_queryname_keys_embedded_in_record() {
const { assert!(RawQuerynameLexKey::EMBEDDED_IN_RECORD) };
const { assert!(RawQuerynameKey::EMBEDDED_IN_RECORD) };
const { assert!(RawCoordinateKey::EMBEDDED_IN_RECORD) };
}
}