use anyhow::Result;
use noodles::sam::alignment::record::data::field::Tag;
use noodles::sam::header::Header;
use noodles::sam::header::record::value::map::read_group::tag as rg_tag;
use std::cmp::Ordering;
use std::collections::HashMap;
use std::sync::Arc;
use bstr::ByteSlice;
use crate::sam::SamTag;
use crate::sam::record_utils::{
mate_unclipped_end, mate_unclipped_start, unclipped_five_prime_position,
};
use crate::template::Template;
use crate::unified_pipeline::GroupKey;
use noodles::sam;
use noodles::sam::alignment::record_buf::data::field::value::Value as DataValue;
pub type LibraryLookup = Arc<HashMap<String, Arc<str>>>;
static UNKNOWN_LIBRARY: std::sync::LazyLock<Arc<str>> =
std::sync::LazyLock::new(|| Arc::from("unknown"));
#[must_use]
pub fn build_library_lookup(header: &Header) -> LibraryLookup {
let mut lookup = HashMap::new();
for (id, rg) in header.read_groups() {
let library: Arc<str> = rg
.other_fields()
.get(&rg_tag::LIBRARY)
.map_or_else(|| Arc::clone(&UNKNOWN_LIBRARY), |s| Arc::from(s.to_string()));
lookup.insert(id.to_string(), library);
}
Arc::new(lookup)
}
#[derive(Debug, Clone)]
pub struct LibraryIndex {
lookup: ahash::AHashMap<u64, u16>,
names: Vec<Arc<str>>,
unknown_idx: u16,
}
impl LibraryIndex {
#[must_use]
pub fn from_header(header: &Header) -> Self {
let mut lookup = ahash::AHashMap::new();
let mut names = vec![Arc::clone(&UNKNOWN_LIBRARY)]; let mut library_to_idx: ahash::AHashMap<Arc<str>, u16> = ahash::AHashMap::new();
library_to_idx.insert(Arc::clone(&UNKNOWN_LIBRARY), 0);
for (id, rg) in header.read_groups() {
let library: Arc<str> = rg
.other_fields()
.get(&rg_tag::LIBRARY)
.map_or_else(|| Arc::clone(&UNKNOWN_LIBRARY), |s| Arc::from(s.to_string()));
let lib_idx = *library_to_idx.entry(library.clone()).or_insert_with(|| {
let idx: u16 =
names.len().try_into().expect("too many distinct libraries for u16 index");
names.push(library);
idx
});
let rg_hash = Self::hash_rg(id.as_bytes());
lookup.insert(rg_hash, lib_idx);
}
Self { lookup, names, unknown_idx: 0 }
}
#[must_use]
pub fn get(&self, rg_hash: u64) -> u16 {
*self.lookup.get(&rg_hash).unwrap_or(&self.unknown_idx)
}
#[must_use]
pub fn library_name(&self, idx: u16) -> &Arc<str> {
self.names.get(idx as usize).unwrap_or(&self.names[0])
}
#[must_use]
pub fn hash_bytes(bytes: Option<&[u8]>) -> u64 {
use ahash::AHasher;
use std::hash::{Hash, Hasher};
match bytes {
Some(b) => {
let mut hasher = AHasher::default();
b.hash(&mut hasher);
hasher.finish()
}
None => 0,
}
}
#[must_use]
pub fn hash_rg(rg_bytes: &[u8]) -> u64 {
Self::hash_bytes(Some(rg_bytes))
}
#[must_use]
pub fn hash_cell_barcode(cell_bytes: Option<&[u8]>) -> u64 {
Self::hash_bytes(cell_bytes)
}
#[must_use]
pub fn hash_name(name_bytes: Option<&[u8]>) -> u64 {
Self::hash_bytes(name_bytes)
}
}
impl Default for LibraryIndex {
fn default() -> Self {
Self {
lookup: ahash::AHashMap::new(),
names: vec![Arc::clone(&UNKNOWN_LIBRARY)],
unknown_idx: 0,
}
}
}
#[must_use]
#[allow(clippy::cast_possible_truncation, clippy::cast_possible_wrap)]
pub fn compute_group_key(
record: &sam::alignment::RecordBuf,
library_index: &LibraryIndex,
cell_tag: Option<Tag>,
) -> GroupKey {
let name_hash = LibraryIndex::hash_name(record.name().map(AsRef::as_ref));
let flags = record.flags();
if flags.is_secondary() || flags.is_supplementary() {
return GroupKey { name_hash, ..GroupKey::default() };
}
let ref_id =
record.reference_sequence_id().map_or(-1, |id| i32::try_from(id).unwrap_or(i32::MAX));
let pos = get_unclipped_position_for_groupkey(record);
let strand = u8::from(flags.is_reverse_complemented());
let library_idx =
if let Some(DataValue::String(rg_bytes)) = record.data().get(&Tag::from(SamTag::RG)) {
let rg_hash = LibraryIndex::hash_rg(rg_bytes);
library_index.get(rg_hash)
} else {
0 };
let cell_hash = if let Some(tag) = cell_tag {
if let Some(DataValue::String(cb_bytes)) = record.data().get(&tag) {
LibraryIndex::hash_cell_barcode(Some(cb_bytes))
} else {
0
}
} else {
0
};
let is_paired = flags.is_segmented();
if !is_paired {
return GroupKey::single(ref_id, pos, strand, library_idx, cell_hash, name_hash);
}
let mate_strand = u8::from(flags.is_mate_reverse_complemented());
let mate_ref_id =
record.mate_reference_sequence_id().map_or(-1, |id| i32::try_from(id).unwrap_or(i32::MAX));
let mate_pos = if mate_strand == 1 {
mate_unclipped_end(record).map(|p| p as i32)
} else {
mate_unclipped_start(record).map(|p| p as i32)
};
match mate_pos {
Some(mp) => GroupKey::paired(
ref_id,
pos,
strand,
mate_ref_id,
mp,
mate_strand,
library_idx,
cell_hash,
name_hash,
),
None => {
GroupKey::single(ref_id, pos, strand, library_idx, cell_hash, name_hash)
}
}
}
#[allow(clippy::cast_possible_wrap, clippy::cast_possible_truncation)]
fn get_unclipped_position_for_groupkey(record: &sam::alignment::RecordBuf) -> i32 {
if record.flags().is_unmapped() {
return 0;
}
unclipped_five_prime_position(record).map_or(i32::MAX, |pos| pos as i32)
}
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct ReadInfo {
pub ref_index1: i32,
pub start1: i32,
pub strand1: u8,
pub ref_index2: i32,
pub start2: i32,
pub strand2: u8,
pub library: Arc<str>,
pub cell_barcode: Option<String>,
}
impl ReadInfo {
const UNKNOWN_REF: i32 = i32::MAX;
const UNKNOWN_POS: i32 = i32::MAX;
const UNKNOWN_STRAND: u8 = u8::MAX;
#[must_use]
#[allow(clippy::too_many_arguments)]
pub fn new(
ref_index1: i32,
start1: i32,
strand1: u8,
ref_index2: i32,
start2: i32,
strand2: u8,
library: Arc<str>,
cell_barcode: Option<String>,
) -> Self {
let r1_earlier = (ref_index1, start1, strand1) <= (ref_index2, start2, strand2);
if r1_earlier {
Self { ref_index1, start1, strand1, ref_index2, start2, strand2, library, cell_barcode }
} else {
Self {
ref_index1: ref_index2,
start1: start2,
strand1: strand2,
ref_index2: ref_index1,
start2: start1,
strand2: strand1,
library,
cell_barcode,
}
}
}
#[must_use]
pub fn single(
ref_index: i32,
start: i32,
strand: u8,
library: Arc<str>,
cell_barcode: Option<String>,
) -> Self {
Self {
ref_index1: ref_index,
start1: start,
strand1: strand,
ref_index2: Self::UNKNOWN_REF,
start2: Self::UNKNOWN_POS,
strand2: Self::UNKNOWN_STRAND,
library,
cell_barcode,
}
}
#[must_use]
pub fn unmapped(library: Arc<str>, cell_barcode: Option<String>) -> Self {
Self {
ref_index1: Self::UNKNOWN_REF,
start1: Self::UNKNOWN_POS,
strand1: Self::UNKNOWN_STRAND,
ref_index2: Self::UNKNOWN_REF,
start2: Self::UNKNOWN_POS,
strand2: Self::UNKNOWN_STRAND,
library,
cell_barcode,
}
}
#[must_use]
pub fn strand_to_byte(positive: bool) -> u8 {
u8::from(!positive)
}
pub fn from(
template: &Template,
cell_tag: Tag,
library_lookup: &LibraryLookup,
) -> Result<Self> {
let r1 = template.r1();
let r2 = template.r2();
let record = r1.as_ref().or(r2.as_ref()).ok_or_else(|| {
anyhow::anyhow!(
"Template '{}' has no records (empty template)",
String::from_utf8_lossy(&template.name)
)
})?;
let library: Arc<str> =
if let Some(DataValue::String(rg_bytes)) = record.data().get(&Tag::from(SamTag::RG)) {
let rg_id = std::str::from_utf8(rg_bytes).unwrap_or("unknown");
library_lookup.get(rg_id).cloned().unwrap_or_else(|| Arc::clone(&UNKNOWN_LIBRARY))
} else {
Arc::clone(&UNKNOWN_LIBRARY)
};
let cell_barcode = if let Some(DataValue::String(cb_str)) = record.data().get(&cell_tag) {
Some(String::from_utf8_lossy(cb_str).into_owned())
} else {
None
};
#[allow(clippy::cast_possible_truncation, clippy::cast_possible_wrap)]
if let (Some(r1), Some(r2)) = (&r1, &r2) {
let ref_index1 = r1.reference_sequence_id().map_or(-1, |id| id as i32);
let start1 = get_unclipped_position(r1)?;
let strand1 = u8::from(r1.flags().is_reverse_complemented());
let ref_index2 = r2.reference_sequence_id().map_or(-1, |id| id as i32);
let start2 = get_unclipped_position(r2)?;
let strand2 = u8::from(r2.flags().is_reverse_complemented());
Ok(ReadInfo::new(
ref_index1,
start1,
strand1,
ref_index2,
start2,
strand2,
library,
cell_barcode,
))
} else {
let ref_index = record.reference_sequence_id().map_or(-1, |id| id as i32);
let start = get_unclipped_position(record)?;
let strand = u8::from(record.flags().is_reverse_complemented());
Ok(ReadInfo::single(ref_index, start, strand, library, cell_barcode))
}
}
}
impl PartialOrd for ReadInfo {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl Ord for ReadInfo {
fn cmp(&self, other: &Self) -> Ordering {
self.ref_index1
.cmp(&other.ref_index1)
.then_with(|| self.start1.cmp(&other.start1))
.then_with(|| self.strand1.cmp(&other.strand1))
.then_with(|| self.ref_index2.cmp(&other.ref_index2))
.then_with(|| self.start2.cmp(&other.start2))
.then_with(|| self.strand2.cmp(&other.strand2))
.then_with(|| self.library.cmp(&other.library))
.then_with(|| self.cell_barcode.cmp(&other.cell_barcode))
}
}
#[allow(clippy::cast_possible_wrap, clippy::cast_possible_truncation)]
fn get_unclipped_position(record: &sam::alignment::RecordBuf) -> Result<i32> {
if record.flags().is_unmapped() {
return Ok(0);
}
unclipped_five_prime_position(record).map(|pos| pos as i32).ok_or_else(|| {
let read_name =
record.name().map_or_else(|| "unknown".into(), |n| String::from_utf8_lossy(n.as_ref()));
anyhow::anyhow!("Mapped read '{read_name}' missing alignment start position")
})
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_read_info_ordering() {
let info1 = ReadInfo::new(0, 100, 0, 0, 200, 1, Arc::from("lib1"), None);
let info2 = ReadInfo::new(0, 100, 0, 0, 200, 1, Arc::from("lib1"), None);
assert_eq!(info1, info2);
}
#[test]
fn test_read_info_normalizes_order() {
let info1 = ReadInfo::new(0, 200, 1, 0, 100, 0, Arc::from("lib1"), None);
let info2 = ReadInfo::new(0, 100, 0, 0, 200, 1, Arc::from("lib1"), None);
assert_eq!(info1.ref_index1, 0);
assert_eq!(info1.start1, 100);
assert_eq!(info1.strand1, 0);
assert_eq!(info1.ref_index2, 0);
assert_eq!(info1.start2, 200);
assert_eq!(info1.strand2, 1);
assert_eq!(info1, info2);
}
#[test]
fn test_read_info_unmapped() {
let info = ReadInfo::unmapped(Arc::from("lib1"), None);
assert_eq!(info.ref_index1, ReadInfo::UNKNOWN_REF);
assert_eq!(info.start1, ReadInfo::UNKNOWN_POS);
}
#[test]
fn test_read_info_single() {
let info = ReadInfo::single(1, 500, 0, Arc::from("lib1"), None);
assert_eq!(info.ref_index1, 1);
assert_eq!(info.start1, 500);
assert_eq!(info.strand1, 0);
assert_eq!(info.ref_index2, ReadInfo::UNKNOWN_REF);
}
#[test]
fn test_strand_to_byte() {
assert_eq!(ReadInfo::strand_to_byte(true), 0);
assert_eq!(ReadInfo::strand_to_byte(false), 1);
}
#[test]
fn test_read_info_comparison() {
let info1 = ReadInfo::new(0, 100, 0, 0, 200, 1, Arc::from("lib1"), None);
let info2 = ReadInfo::new(0, 150, 0, 0, 200, 1, Arc::from("lib1"), None);
assert!(info1 < info2);
}
#[test]
fn test_different_chromosomes() {
let info1 = ReadInfo::new(0, 100, 0, 1, 200, 1, Arc::from("lib1"), None);
let info2 = ReadInfo::new(1, 100, 0, 0, 200, 1, Arc::from("lib1"), None);
assert_eq!(info1.ref_index1, 0);
assert_eq!(info1.start1, 100);
assert_eq!(info2.ref_index1, 0);
assert_eq!(info2.start1, 200);
assert_ne!(info1, info2);
}
#[test]
fn test_build_library_lookup() {
use bstr::BString;
use noodles::sam::header::record::value::Map;
use noodles::sam::header::record::value::map::ReadGroup;
use noodles::sam::header::record::value::map::read_group::tag as rg_tag;
let mut header = Header::builder();
let rg1 = Map::<ReadGroup>::builder()
.insert(rg_tag::LIBRARY, String::from("libA"))
.build()
.expect("building read group with library should succeed");
header = header.add_read_group(BString::from("RG1"), rg1);
let rg2 = Map::<ReadGroup>::builder()
.insert(rg_tag::LIBRARY, String::from("libB"))
.build()
.expect("building read group with library should succeed");
header = header.add_read_group(BString::from("RG2"), rg2);
let rg3 = Map::<ReadGroup>::builder().build().expect("build should succeed");
header = header.add_read_group(BString::from("RG3"), rg3);
let header = header.build();
let lookup = super::build_library_lookup(&header);
assert_eq!(lookup.len(), 3);
assert_eq!(lookup.get("RG1").map(AsRef::as_ref), Some("libA"));
assert_eq!(lookup.get("RG2").map(AsRef::as_ref), Some("libB"));
assert_eq!(lookup.get("RG3").map(AsRef::as_ref), Some("unknown"));
}
#[test]
fn test_single_uses_unknown_values() {
let info = ReadInfo::single(1, 500, 0, Arc::from("lib1"), None);
assert_eq!(info.ref_index1, 1);
assert_eq!(info.start1, 500);
assert_eq!(info.strand1, 0);
assert_eq!(info.ref_index2, i32::MAX);
assert_eq!(info.start2, i32::MAX);
assert_eq!(info.strand2, u8::MAX);
}
#[test]
fn test_single_sorts_after_paired() {
let paired = ReadInfo::new(0, 100, 0, 0, 200, 1, Arc::from("lib1"), None);
let single = ReadInfo::single(0, 100, 0, Arc::from("lib1"), None);
assert!(single > paired);
}
#[test]
fn test_ord_includes_cell_barcode() {
let lib: Arc<str> = Arc::from("lib1");
let info1 = ReadInfo::new(0, 100, 0, 0, 200, 1, lib.clone(), Some("cell1".to_string()));
let info2 = ReadInfo::new(0, 100, 0, 0, 200, 1, lib.clone(), Some("cell2".to_string()));
let info3 = ReadInfo::new(0, 100, 0, 0, 200, 1, lib, None);
assert!(info1 < info2);
assert!(info3 < info1);
assert_ne!(info1, info2);
assert_ne!(info1, info3);
}
#[test]
fn test_normalization_with_strand_tiebreaker() {
let info1 = ReadInfo::new(0, 100, 1, 0, 100, 0, Arc::from("lib1"), None);
assert_eq!(info1.strand1, 0);
assert_eq!(info1.strand2, 1);
}
mod get_unclipped_position_tests {
use super::*;
use crate::sam::builder::RecordBuilder;
use noodles::sam::alignment::RecordBuf;
fn build_record(alignment_start: usize, cigar: &str, reverse: bool) -> RecordBuf {
RecordBuilder::new()
.sequence("ACGT") .alignment_start(alignment_start)
.cigar(cigar)
.reverse_complement(reverse)
.build()
}
#[test]
fn test_forward_strand_soft_clip_only() {
let record = build_record(100, "5S50M", false);
let pos =
get_unclipped_position(&record).expect("get_unclipped_position should succeed");
assert_eq!(pos, 95);
}
#[test]
fn test_forward_strand_hard_clip_only() {
let record = build_record(100, "10H50M", false);
let pos =
get_unclipped_position(&record).expect("get_unclipped_position should succeed");
assert_eq!(pos, 90);
}
#[test]
fn test_forward_strand_hard_and_soft_clip() {
let record = build_record(100, "10H5S50M", false);
let pos =
get_unclipped_position(&record).expect("get_unclipped_position should succeed");
assert_eq!(pos, 85);
}
#[test]
fn test_reverse_strand_soft_clip_only() {
let record = build_record(100, "50M5S", true);
let pos =
get_unclipped_position(&record).expect("get_unclipped_position should succeed");
assert_eq!(pos, 154);
}
#[test]
fn test_reverse_strand_hard_clip_only() {
let record = build_record(100, "50M10H", true);
let pos =
get_unclipped_position(&record).expect("get_unclipped_position should succeed");
assert_eq!(pos, 159);
}
#[test]
fn test_reverse_strand_hard_and_soft_clip() {
let record = build_record(100, "50M5S10H", true);
let pos =
get_unclipped_position(&record).expect("get_unclipped_position should succeed");
assert_eq!(pos, 164);
}
#[test]
fn test_no_clipping() {
let forward_record = build_record(100, "50M", false);
assert_eq!(
get_unclipped_position(&forward_record)
.expect("get_unclipped_position should succeed"),
100
);
let reverse_record = build_record(100, "50M", true);
assert_eq!(
get_unclipped_position(&reverse_record)
.expect("get_unclipped_position should succeed"),
149
);
}
#[test]
fn test_unmapped_read() {
let record = RecordBuilder::new().sequence("ACGT").unmapped(true).build();
assert_eq!(
get_unclipped_position(&record).expect("get_unclipped_position should succeed"),
0
);
}
#[test]
fn test_complex_cigar_with_insertions_deletions() {
let forward_record = build_record(100, "5H3S10M2I5M3D10M4S6H", false);
assert_eq!(
get_unclipped_position(&forward_record)
.expect("get_unclipped_position should succeed"),
92
);
let reverse_record = build_record(100, "5H3S10M2I5M3D10M4S6H", true);
assert_eq!(
get_unclipped_position(&reverse_record)
.expect("get_unclipped_position should succeed"),
137
);
}
}
}