use anyhow::Result;
use noodles::sam::alignment::record::data::field::Tag;
use noodles::sam::header::Header;
use noodles::sam::header::record::value::map::read_group::tag as rg_tag;
use std::cmp::Ordering;
use std::collections::HashMap;
use std::sync::Arc;
use bstr::ByteSlice;
use crate::sam::SamTag;
use crate::template::Template;
pub type LibraryLookup = Arc<HashMap<String, Arc<str>>>;
static UNKNOWN_LIBRARY: std::sync::LazyLock<Arc<str>> =
std::sync::LazyLock::new(|| Arc::from("unknown"));
#[must_use]
pub fn build_library_lookup(header: &Header) -> LibraryLookup {
let mut lookup = HashMap::new();
for (id, rg) in header.read_groups() {
let library: Arc<str> = rg
.other_fields()
.get(&rg_tag::LIBRARY)
.map_or_else(|| Arc::clone(&UNKNOWN_LIBRARY), |s| Arc::from(s.to_string()));
lookup.insert(id.to_string(), library);
}
Arc::new(lookup)
}
#[derive(Debug, Clone)]
pub struct LibraryIndex {
lookup: ahash::AHashMap<u64, u16>,
names: Vec<Arc<str>>,
unknown_idx: u16,
}
impl LibraryIndex {
#[must_use]
pub fn from_header(header: &Header) -> Self {
let mut lookup = ahash::AHashMap::new();
let mut names = vec![Arc::clone(&UNKNOWN_LIBRARY)]; let mut library_to_idx: ahash::AHashMap<Arc<str>, u16> = ahash::AHashMap::new();
library_to_idx.insert(Arc::clone(&UNKNOWN_LIBRARY), 0);
for (id, rg) in header.read_groups() {
let library: Arc<str> = rg
.other_fields()
.get(&rg_tag::LIBRARY)
.map_or_else(|| Arc::clone(&UNKNOWN_LIBRARY), |s| Arc::from(s.to_string()));
let lib_idx = *library_to_idx.entry(library.clone()).or_insert_with(|| {
let idx: u16 =
names.len().try_into().expect("too many distinct libraries for u16 index");
names.push(library);
idx
});
let rg_hash = Self::hash_rg(id.as_bytes());
lookup.insert(rg_hash, lib_idx);
}
Self { lookup, names, unknown_idx: 0 }
}
#[must_use]
pub fn get(&self, rg_hash: u64) -> u16 {
*self.lookup.get(&rg_hash).unwrap_or(&self.unknown_idx)
}
#[must_use]
pub fn library_name(&self, idx: u16) -> &Arc<str> {
self.names.get(idx as usize).unwrap_or(&self.names[0])
}
#[must_use]
pub fn hash_bytes(bytes: Option<&[u8]>) -> u64 {
use ahash::AHasher;
use std::hash::{Hash, Hasher};
match bytes {
Some(b) => {
let mut hasher = AHasher::default();
b.hash(&mut hasher);
hasher.finish()
}
None => 0,
}
}
#[must_use]
pub fn hash_rg(rg_bytes: &[u8]) -> u64 {
Self::hash_bytes(Some(rg_bytes))
}
#[must_use]
pub fn hash_cell_barcode(cell_bytes: Option<&[u8]>) -> u64 {
Self::hash_bytes(cell_bytes)
}
#[must_use]
pub fn hash_name(name_bytes: Option<&[u8]>) -> u64 {
Self::hash_bytes(name_bytes)
}
}
impl Default for LibraryIndex {
fn default() -> Self {
Self {
lookup: ahash::AHashMap::new(),
names: vec![Arc::clone(&UNKNOWN_LIBRARY)],
unknown_idx: 0,
}
}
}
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct ReadInfo {
pub ref_index1: i32,
pub start1: i32,
pub strand1: u8,
pub ref_index2: i32,
pub start2: i32,
pub strand2: u8,
pub library: Arc<str>,
pub cell_barcode: Option<String>,
}
impl ReadInfo {
const UNKNOWN_REF: i32 = i32::MAX;
const UNKNOWN_POS: i32 = i32::MAX;
const UNKNOWN_STRAND: u8 = u8::MAX;
#[must_use]
#[allow(clippy::too_many_arguments)]
pub fn new(
ref_index1: i32,
start1: i32,
strand1: u8,
ref_index2: i32,
start2: i32,
strand2: u8,
library: Arc<str>,
cell_barcode: Option<String>,
) -> Self {
let r1_earlier = (ref_index1, start1, strand1) <= (ref_index2, start2, strand2);
if r1_earlier {
Self { ref_index1, start1, strand1, ref_index2, start2, strand2, library, cell_barcode }
} else {
Self {
ref_index1: ref_index2,
start1: start2,
strand1: strand2,
ref_index2: ref_index1,
start2: start1,
strand2: strand1,
library,
cell_barcode,
}
}
}
#[must_use]
pub fn single(
ref_index: i32,
start: i32,
strand: u8,
library: Arc<str>,
cell_barcode: Option<String>,
) -> Self {
Self {
ref_index1: ref_index,
start1: start,
strand1: strand,
ref_index2: Self::UNKNOWN_REF,
start2: Self::UNKNOWN_POS,
strand2: Self::UNKNOWN_STRAND,
library,
cell_barcode,
}
}
#[must_use]
pub fn unmapped(library: Arc<str>, cell_barcode: Option<String>) -> Self {
Self {
ref_index1: Self::UNKNOWN_REF,
start1: Self::UNKNOWN_POS,
strand1: Self::UNKNOWN_STRAND,
ref_index2: Self::UNKNOWN_REF,
start2: Self::UNKNOWN_POS,
strand2: Self::UNKNOWN_STRAND,
library,
cell_barcode,
}
}
#[must_use]
pub fn strand_to_byte(positive: bool) -> u8 {
u8::from(!positive)
}
pub fn from(
template: &Template,
cell_tag: Tag,
library_lookup: &LibraryLookup,
) -> Result<Self> {
use crate::sort::bam_fields;
use fgumi_raw_bam::{RawRecord, RawRecordView, TagValue};
let r1 = template.r1();
let r2 = template.r2();
let record: &RawRecord = r1.or(r2).ok_or_else(|| {
anyhow::anyhow!(
"Template '{}' has no records (empty template)",
String::from_utf8_lossy(&template.name)
)
})?;
let library: Arc<str> =
if let Some(TagValue::String(rg_bytes)) = record.tags().get(SamTag::RG.as_ref()) {
let rg_id = std::str::from_utf8(rg_bytes).unwrap_or("unknown");
library_lookup.get(rg_id).cloned().unwrap_or_else(|| Arc::clone(&UNKNOWN_LIBRARY))
} else {
Arc::clone(&UNKNOWN_LIBRARY)
};
let cell_barcode =
if let Some(TagValue::String(cb_bytes)) = record.tags().get(cell_tag.as_ref()) {
Some(String::from_utf8_lossy(cb_bytes).into_owned())
} else {
None
};
let is_unmapped =
|r: &RawRecord| (RawRecordView::new(r).flags() & bam_fields::flags::UNMAPPED) != 0;
let raw_info = |r: &RawRecord| -> Result<(i32, i32, u8)> {
Ok((
bam_fields::ref_id(r.as_ref()),
get_unclipped_position_raw(r)?,
u8::from((RawRecordView::new(r).flags() & bam_fields::flags::REVERSE) != 0),
))
};
#[allow(clippy::cast_possible_truncation, clippy::cast_possible_wrap)]
if let (Some(r1), Some(r2)) = (r1, r2) {
match (is_unmapped(r1), is_unmapped(r2)) {
(true, true) => Ok(ReadInfo::unmapped(library, cell_barcode)),
(true, false) => {
let (ref_index, start, strand) = raw_info(r2)?;
Ok(ReadInfo::single(ref_index, start, strand, library, cell_barcode))
}
(false, true) => {
let (ref_index, start, strand) = raw_info(r1)?;
Ok(ReadInfo::single(ref_index, start, strand, library, cell_barcode))
}
(false, false) => {
let (ref_index1, start1, strand1) = raw_info(r1)?;
let (ref_index2, start2, strand2) = raw_info(r2)?;
Ok(ReadInfo::new(
ref_index1,
start1,
strand1,
ref_index2,
start2,
strand2,
library,
cell_barcode,
))
}
}
} else {
if is_unmapped(record) {
return Ok(ReadInfo::unmapped(library, cell_barcode));
}
let (ref_index, start, strand) = raw_info(record)?;
Ok(ReadInfo::single(ref_index, start, strand, library, cell_barcode))
}
}
}
impl PartialOrd for ReadInfo {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl Ord for ReadInfo {
fn cmp(&self, other: &Self) -> Ordering {
self.ref_index1
.cmp(&other.ref_index1)
.then_with(|| self.start1.cmp(&other.start1))
.then_with(|| self.strand1.cmp(&other.strand1))
.then_with(|| self.ref_index2.cmp(&other.ref_index2))
.then_with(|| self.start2.cmp(&other.start2))
.then_with(|| self.strand2.cmp(&other.strand2))
.then_with(|| self.library.cmp(&other.library))
.then_with(|| self.cell_barcode.cmp(&other.cell_barcode))
}
}
fn get_unclipped_position_raw(record: &fgumi_raw_bam::RawRecord) -> Result<i32> {
let pos = fgumi_raw_bam::unclipped_5prime_from_raw_bam(record.as_ref());
if pos == i32::MAX {
let read_name = String::from_utf8_lossy(fgumi_raw_bam::read_name(record.as_ref()));
Err(anyhow::anyhow!("Mapped read '{read_name}' missing alignment start position"))
} else {
Ok(pos)
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_read_info_ordering() {
let info1 = ReadInfo::new(0, 100, 0, 0, 200, 1, Arc::from("lib1"), None);
let info2 = ReadInfo::new(0, 100, 0, 0, 200, 1, Arc::from("lib1"), None);
assert_eq!(info1, info2);
}
#[test]
fn test_read_info_normalizes_order() {
let info1 = ReadInfo::new(0, 200, 1, 0, 100, 0, Arc::from("lib1"), None);
let info2 = ReadInfo::new(0, 100, 0, 0, 200, 1, Arc::from("lib1"), None);
assert_eq!(info1.ref_index1, 0);
assert_eq!(info1.start1, 100);
assert_eq!(info1.strand1, 0);
assert_eq!(info1.ref_index2, 0);
assert_eq!(info1.start2, 200);
assert_eq!(info1.strand2, 1);
assert_eq!(info1, info2);
}
#[test]
fn test_read_info_unmapped() {
let info = ReadInfo::unmapped(Arc::from("lib1"), None);
assert_eq!(info.ref_index1, ReadInfo::UNKNOWN_REF);
assert_eq!(info.start1, ReadInfo::UNKNOWN_POS);
}
#[test]
fn test_read_info_single() {
let info = ReadInfo::single(1, 500, 0, Arc::from("lib1"), None);
assert_eq!(info.ref_index1, 1);
assert_eq!(info.start1, 500);
assert_eq!(info.strand1, 0);
assert_eq!(info.ref_index2, ReadInfo::UNKNOWN_REF);
}
#[test]
fn test_strand_to_byte() {
assert_eq!(ReadInfo::strand_to_byte(true), 0);
assert_eq!(ReadInfo::strand_to_byte(false), 1);
}
#[test]
fn test_read_info_comparison() {
let info1 = ReadInfo::new(0, 100, 0, 0, 200, 1, Arc::from("lib1"), None);
let info2 = ReadInfo::new(0, 150, 0, 0, 200, 1, Arc::from("lib1"), None);
assert!(info1 < info2);
}
#[test]
fn test_different_chromosomes() {
let info1 = ReadInfo::new(0, 100, 0, 1, 200, 1, Arc::from("lib1"), None);
let info2 = ReadInfo::new(1, 100, 0, 0, 200, 1, Arc::from("lib1"), None);
assert_eq!(info1.ref_index1, 0);
assert_eq!(info1.start1, 100);
assert_eq!(info2.ref_index1, 0);
assert_eq!(info2.start1, 200);
assert_ne!(info1, info2);
}
#[test]
fn test_build_library_lookup() {
use bstr::BString;
use noodles::sam::header::record::value::Map;
use noodles::sam::header::record::value::map::ReadGroup;
use noodles::sam::header::record::value::map::read_group::tag as rg_tag;
let mut header = Header::builder();
let rg1 = Map::<ReadGroup>::builder()
.insert(rg_tag::LIBRARY, String::from("libA"))
.build()
.expect("building read group with library should succeed");
header = header.add_read_group(BString::from("RG1"), rg1);
let rg2 = Map::<ReadGroup>::builder()
.insert(rg_tag::LIBRARY, String::from("libB"))
.build()
.expect("building read group with library should succeed");
header = header.add_read_group(BString::from("RG2"), rg2);
let rg3 = Map::<ReadGroup>::builder().build().expect("build should succeed");
header = header.add_read_group(BString::from("RG3"), rg3);
let header = header.build();
let lookup = super::build_library_lookup(&header);
assert_eq!(lookup.len(), 3);
assert_eq!(lookup.get("RG1").map(AsRef::as_ref), Some("libA"));
assert_eq!(lookup.get("RG2").map(AsRef::as_ref), Some("libB"));
assert_eq!(lookup.get("RG3").map(AsRef::as_ref), Some("unknown"));
}
#[test]
fn test_single_uses_unknown_values() {
let info = ReadInfo::single(1, 500, 0, Arc::from("lib1"), None);
assert_eq!(info.ref_index1, 1);
assert_eq!(info.start1, 500);
assert_eq!(info.strand1, 0);
assert_eq!(info.ref_index2, i32::MAX);
assert_eq!(info.start2, i32::MAX);
assert_eq!(info.strand2, u8::MAX);
}
#[test]
fn test_single_sorts_after_paired() {
let paired = ReadInfo::new(0, 100, 0, 0, 200, 1, Arc::from("lib1"), None);
let single = ReadInfo::single(0, 100, 0, Arc::from("lib1"), None);
assert!(single > paired);
}
#[test]
fn test_ord_includes_cell_barcode() {
let lib: Arc<str> = Arc::from("lib1");
let info1 = ReadInfo::new(0, 100, 0, 0, 200, 1, lib.clone(), Some("cell1".to_string()));
let info2 = ReadInfo::new(0, 100, 0, 0, 200, 1, lib.clone(), Some("cell2".to_string()));
let info3 = ReadInfo::new(0, 100, 0, 0, 200, 1, lib, None);
assert!(info1 < info2);
assert!(info3 < info1);
assert_ne!(info1, info2);
assert_ne!(info1, info3);
}
#[test]
fn test_normalization_with_strand_tiebreaker() {
let info1 = ReadInfo::new(0, 100, 1, 0, 100, 0, Arc::from("lib1"), None);
assert_eq!(info1.strand1, 0);
assert_eq!(info1.strand2, 1);
}
mod get_unclipped_position_tests {
use super::*;
use fgumi_raw_bam::{RawRecord, SamBuilder as RawSamBuilder, flags as raw_flags};
fn parse_cigar_to_raw(cigar: &str) -> Vec<u32> {
let mut ops = Vec::new();
let mut num_str = String::new();
for c in cigar.chars() {
if c.is_ascii_digit() {
num_str.push(c);
} else {
let len: u32 = num_str.parse().expect("Invalid CIGAR: expected number");
let op_type: u32 = match c {
'M' => 0,
'I' => 1,
'D' => 2,
'N' => 3,
'S' => 4,
'H' => 5,
'P' => 6,
'=' => 7,
'X' => 8,
_ => panic!("Unknown CIGAR operation: {c}"),
};
ops.push((len << 4) | op_type);
num_str.clear();
}
}
ops
}
fn cigar_query_len(cigar: &str) -> usize {
let mut num_str = String::new();
let mut total = 0usize;
for c in cigar.chars() {
if c.is_ascii_digit() {
num_str.push(c);
} else {
let len: usize = num_str.parse().expect("number");
if matches!(c, 'M' | 'I' | 'S' | '=' | 'X') {
total += len;
}
num_str.clear();
}
}
total
}
fn build_record(alignment_start: usize, cigar: &str, reverse: bool) -> RawRecord {
let raw_cigar = parse_cigar_to_raw(cigar);
let qlen = cigar_query_len(cigar).max(1);
let flags: u16 = if reverse { raw_flags::REVERSE } else { 0 };
let mut b = RawSamBuilder::new();
b.sequence(&vec![b'A'; qlen])
.qualities(&vec![30u8; qlen])
.flags(flags)
.ref_id(0)
.pos(i32::try_from(alignment_start).expect("alignment_start fits i32") - 1)
.cigar_ops(&raw_cigar);
b.build()
}
#[test]
fn test_forward_strand_soft_clip_only() {
let record = build_record(100, "5S50M", false);
let pos = get_unclipped_position_raw(&record)
.expect("get_unclipped_position_raw should succeed");
assert_eq!(pos, 95);
}
#[test]
fn test_forward_strand_hard_clip_only() {
let record = build_record(100, "10H50M", false);
let pos = get_unclipped_position_raw(&record)
.expect("get_unclipped_position_raw should succeed");
assert_eq!(pos, 90);
}
#[test]
fn test_forward_strand_hard_and_soft_clip() {
let record = build_record(100, "10H5S50M", false);
let pos = get_unclipped_position_raw(&record)
.expect("get_unclipped_position_raw should succeed");
assert_eq!(pos, 85);
}
#[test]
fn test_reverse_strand_soft_clip_only() {
let record = build_record(100, "50M5S", true);
let pos = get_unclipped_position_raw(&record)
.expect("get_unclipped_position_raw should succeed");
assert_eq!(pos, 154);
}
#[test]
fn test_reverse_strand_hard_clip_only() {
let record = build_record(100, "50M10H", true);
let pos = get_unclipped_position_raw(&record)
.expect("get_unclipped_position_raw should succeed");
assert_eq!(pos, 159);
}
#[test]
fn test_reverse_strand_hard_and_soft_clip() {
let record = build_record(100, "50M5S10H", true);
let pos = get_unclipped_position_raw(&record)
.expect("get_unclipped_position_raw should succeed");
assert_eq!(pos, 164);
}
#[test]
fn test_no_clipping() {
let forward_record = build_record(100, "50M", false);
assert_eq!(
get_unclipped_position_raw(&forward_record)
.expect("get_unclipped_position_raw should succeed"),
100
);
let reverse_record = build_record(100, "50M", true);
assert_eq!(
get_unclipped_position_raw(&reverse_record)
.expect("get_unclipped_position_raw should succeed"),
149
);
}
#[test]
fn test_unmapped_read() {
let mut b = RawSamBuilder::new();
b.sequence(b"ACGT").qualities(&[30; 4]).flags(raw_flags::UNMAPPED);
let record = b.build();
assert_eq!(
get_unclipped_position_raw(&record)
.expect("get_unclipped_position_raw should succeed"),
0
);
}
#[test]
fn test_complex_cigar_with_insertions_deletions() {
let forward_record = build_record(100, "5H3S10M2I5M3D10M4S6H", false);
assert_eq!(
get_unclipped_position_raw(&forward_record)
.expect("get_unclipped_position_raw should succeed"),
92
);
let reverse_record = build_record(100, "5H3S10M2I5M3D10M4S6H", true);
assert_eq!(
get_unclipped_position_raw(&reverse_record)
.expect("get_unclipped_position_raw should succeed"),
137
);
}
}
}