use std::collections::VecDeque;
use std::sync::atomic::{AtomicUsize, Ordering};
use ahash::{AHashMap, AHashSet};
use rayon::prelude::*;
use crate::bitenc::BitEnc;
use crate::template::MoleculeId;
use super::{Umi, UmiAssigner};
pub struct UnionFind {
parent: Vec<AtomicUsize>,
rank: Vec<AtomicUsize>,
}
impl UnionFind {
pub fn new(n: usize) -> Self {
Self {
parent: (0..n).map(AtomicUsize::new).collect(),
rank: (0..n).map(|_| AtomicUsize::new(0)).collect(),
}
}
#[must_use]
pub fn find(&self, x: usize) -> usize {
let mut current = x;
loop {
let parent = self.parent[current].load(Ordering::Acquire);
if parent == current {
return current;
}
let grandparent = self.parent[parent].load(Ordering::Acquire);
let _ = self.parent[current].compare_exchange(
parent,
grandparent,
Ordering::Release,
Ordering::Relaxed,
);
current = parent;
}
}
#[must_use]
pub fn union(&self, x: usize, y: usize) -> bool {
loop {
let root_x = self.find(x);
let root_y = self.find(y);
if root_x == root_y {
return false;
}
let rank_x = self.rank[root_x].load(Ordering::Acquire);
let rank_y = self.rank[root_y].load(Ordering::Acquire);
let (smaller, larger) =
if rank_x < rank_y { (root_x, root_y) } else { (root_y, root_x) };
if self.parent[smaller]
.compare_exchange(smaller, larger, Ordering::Release, Ordering::Relaxed)
.is_ok()
{
if rank_x == rank_y {
let _ = self.rank[larger].compare_exchange(
rank_x,
rank_x + 1,
Ordering::Release,
Ordering::Relaxed,
);
}
return true;
}
}
}
pub fn union_parallel(&self, edges: &[(usize, usize)]) {
edges.par_iter().for_each(|&(i, j)| {
let _ = self.union(i, j);
});
}
}
#[inline]
fn generate_neighbors(umi: &BitEnc) -> impl Iterator<Item = BitEnc> + '_ {
(0..umi.len()).flat_map(move |pos| {
let current_base = umi.base_at(pos);
(0..4u8)
.filter(move |&base| base != current_base)
.map(move |base| umi.with_base_at(pos, base))
})
}
fn generate_neighbors_k(umi: &BitEnc, k: u32) -> Vec<BitEnc> {
if k == 0 {
return vec![*umi];
}
if k == 1 {
return generate_neighbors(umi).collect();
}
let mut result = AHashSet::new();
generate_neighbors_k_recursive(umi, k, 0, &mut result);
result.into_iter().collect()
}
fn generate_neighbors_k_recursive(
umi: &BitEnc,
remaining_edits: u32,
start_pos: usize,
result: &mut AHashSet<BitEnc>,
) {
if remaining_edits == 0 {
result.insert(*umi);
return;
}
result.insert(*umi);
for pos in start_pos..umi.len() {
let current_base = umi.base_at(pos);
for new_base in 0..4u8 {
if new_base != current_base {
let mutated = umi.with_base_at(pos, new_base);
generate_neighbors_k_recursive(&mutated, remaining_edits - 1, pos + 1, result);
}
}
}
}
#[must_use]
pub fn discover_edges_parallel_k1(
umis: &[(BitEnc, usize)], ) -> Vec<(usize, usize)> {
let umi_to_idx: AHashMap<BitEnc, usize> =
umis.iter().enumerate().map(|(i, (enc, _))| (*enc, i)).collect();
umis.par_iter()
.enumerate()
.flat_map(|(i, (enc, _))| {
let mut edges = Vec::new();
for neighbor in generate_neighbors(enc) {
if let Some(&j) = umi_to_idx.get(&neighbor) {
if i < j {
edges.push((i, j));
}
}
}
edges
})
.collect()
}
#[must_use]
pub fn discover_edges_parallel_k(
umis: &[(BitEnc, usize)],
max_mismatches: u32,
) -> Vec<(usize, usize)> {
if max_mismatches == 1 {
return discover_edges_parallel_k1(umis);
}
let umi_to_idx: AHashMap<BitEnc, usize> =
umis.iter().enumerate().map(|(i, (enc, _))| (*enc, i)).collect();
umis.par_iter()
.enumerate()
.flat_map(|(i, (enc, _))| {
let mut edges = Vec::new();
let neighbors = generate_neighbors_k(enc, max_mismatches);
for neighbor in neighbors {
if let Some(&j) = umi_to_idx.get(&neighbor) {
if i < j {
if enc.hamming_distance(&neighbor) <= max_mismatches {
edges.push((i, j));
}
}
}
}
edges
})
.collect()
}
pub struct ParallelIdentityAssigner {
pool: rayon::ThreadPool,
}
impl ParallelIdentityAssigner {
#[must_use]
pub fn new(threads: usize) -> Self {
let pool = rayon::ThreadPoolBuilder::new()
.num_threads(threads)
.build()
.expect("Failed to build rayon thread pool");
Self { pool }
}
}
impl UmiAssigner for ParallelIdentityAssigner {
fn assign(&self, raw_umis: &[Umi]) -> Vec<MoleculeId> {
if raw_umis.is_empty() {
return Vec::new();
}
let canonicals: Vec<String> =
self.pool.install(|| raw_umis.par_iter().map(|umi| umi.to_uppercase()).collect());
let chunk_size = (canonicals.len() / self.pool.current_num_threads().max(1)).max(1);
let per_chunk_sets: Vec<AHashSet<&str>> = self.pool.install(|| {
canonicals
.par_chunks(chunk_size)
.map(|chunk| chunk.iter().map(String::as_str).collect::<AHashSet<&str>>())
.collect()
});
let mut unique_canonicals: Vec<&str> = Vec::new();
let mut seen = AHashSet::new();
for chunk_set in &per_chunk_sets {
for &umi in chunk_set {
if seen.insert(umi) {
unique_canonicals.push(umi);
}
}
}
unique_canonicals.sort_unstable();
let canonical_to_id: AHashMap<&str, MoleculeId> = unique_canonicals
.into_iter()
.enumerate()
.map(|(i, umi)| (umi, MoleculeId::Single(i as u64)))
.collect();
self.pool.install(|| {
canonicals.par_iter().map(|canonical| canonical_to_id[canonical.as_str()]).collect()
})
}
fn as_any(&self) -> &dyn std::any::Any {
self
}
}
pub struct ParallelEditAssigner {
max_mismatches: u32,
pool: rayon::ThreadPool,
}
impl ParallelEditAssigner {
#[must_use]
pub fn new(max_mismatches: u32, threads: usize) -> Self {
let pool = rayon::ThreadPoolBuilder::new()
.num_threads(threads)
.build()
.expect("Failed to build rayon thread pool");
Self { max_mismatches, pool }
}
}
impl UmiAssigner for ParallelEditAssigner {
fn assign(&self, raw_umis: &[Umi]) -> Vec<MoleculeId> {
if raw_umis.is_empty() {
return Vec::new();
}
let mut umi_counts: AHashMap<BitEnc, usize> = AHashMap::new();
let mut umi_to_original: Vec<Option<BitEnc>> = Vec::with_capacity(raw_umis.len());
for umi in raw_umis {
let umi_upper = umi.to_uppercase();
if let Some(enc) = BitEnc::from_umi_str(&umi_upper) {
*umi_counts.entry(enc).or_insert(0) += 1;
umi_to_original.push(Some(enc));
} else {
umi_to_original.push(None);
}
}
if umi_counts.is_empty() {
return (0..raw_umis.len()).map(|i| MoleculeId::Single(i as u64)).collect();
}
let unique_umis: Vec<(BitEnc, usize)> = umi_counts.into_iter().collect();
let enc_to_idx: AHashMap<BitEnc, usize> =
unique_umis.iter().enumerate().map(|(i, (enc, _))| (*enc, i)).collect();
let max_mismatches = self.max_mismatches;
let (edges, uf) = self.pool.install(|| {
let edges = discover_edges_parallel_k(&unique_umis, max_mismatches);
let uf = UnionFind::new(unique_umis.len());
uf.union_parallel(&edges);
(edges, uf)
});
drop(edges);
let mut root_to_mol: AHashMap<usize, MoleculeId> = AHashMap::new();
let mut next_mol_id: u64 = 0;
let mut result = Vec::with_capacity(raw_umis.len());
for enc_opt in &umi_to_original {
let mol_id = if let Some(enc) = enc_opt {
let idx = enc_to_idx[enc];
let root = uf.find(idx);
*root_to_mol.entry(root).or_insert_with(|| {
let id = MoleculeId::Single(next_mol_id);
next_mol_id += 1;
id
})
} else {
let id = MoleculeId::Single(next_mol_id);
next_mol_id += 1;
id
};
result.push(mol_id);
}
result
}
fn as_any(&self) -> &dyn std::any::Any {
self
}
}
pub struct ParallelAdjacencyAssigner {
max_mismatches: u32,
pool: rayon::ThreadPool,
}
impl ParallelAdjacencyAssigner {
#[must_use]
pub fn new(max_mismatches: u32, threads: usize) -> Self {
let pool = rayon::ThreadPoolBuilder::new()
.num_threads(threads)
.build()
.expect("Failed to build rayon thread pool");
Self { max_mismatches, pool }
}
}
impl UmiAssigner for ParallelAdjacencyAssigner {
fn assign(&self, raw_umis: &[Umi]) -> Vec<MoleculeId> {
if raw_umis.is_empty() {
return Vec::new();
}
let mut umi_counts: AHashMap<String, usize> = AHashMap::new();
for umi in raw_umis {
*umi_counts.entry(umi.to_uppercase()).or_insert(0) += 1;
}
let mut sorted_umis: Vec<(String, usize, BitEnc)> = umi_counts
.iter()
.filter_map(|(umi, &count)| {
BitEnc::from_umi_str(umi).map(|enc| (umi.clone(), count, enc))
})
.collect();
if sorted_umis.is_empty() {
return (0..raw_umis.len()).map(|i| MoleculeId::Single(i as u64)).collect();
}
sorted_umis.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
let unique_umis: Vec<(BitEnc, usize)> =
sorted_umis.iter().map(|(_, count, enc)| (*enc, *count)).collect();
let max_mismatches = self.max_mismatches;
let edges = self.pool.install(|| discover_edges_parallel_k(&unique_umis, max_mismatches));
let mut adj_list: Vec<Vec<usize>> = vec![Vec::new(); unique_umis.len()];
for (i, j) in edges {
adj_list[i].push(j);
adj_list[j].push(i);
}
let mut assigned = vec![false; unique_umis.len()];
let mut mol_ids: Vec<MoleculeId> = vec![MoleculeId::None; unique_umis.len()];
let mut next_mol_id: u64 = 0;
for root_idx in 0..unique_umis.len() {
if assigned[root_idx] {
continue;
}
let mol_id = MoleculeId::Single(next_mol_id);
next_mol_id += 1;
let mut queue = VecDeque::new();
queue.push_back(root_idx);
assigned[root_idx] = true;
mol_ids[root_idx] = mol_id;
while let Some(idx) = queue.pop_front() {
let parent_count = unique_umis[idx].1;
let max_child_count = parent_count / 2 + 1;
for &neighbor_idx in &adj_list[idx] {
if !assigned[neighbor_idx] {
let neighbor_count = unique_umis[neighbor_idx].1;
if neighbor_count <= max_child_count {
assigned[neighbor_idx] = true;
mol_ids[neighbor_idx] = mol_id;
queue.push_back(neighbor_idx);
}
}
}
}
}
let str_to_mol: AHashMap<&str, MoleculeId> = sorted_umis
.iter()
.enumerate()
.map(|(i, (umi, _, _))| (umi.as_str(), mol_ids[i]))
.collect();
raw_umis
.iter()
.map(|umi| {
let upper = umi.to_uppercase();
str_to_mol.get(upper.as_str()).copied().unwrap_or_else(|| {
let id = MoleculeId::Single(next_mol_id);
next_mol_id += 1;
id
})
})
.collect()
}
fn as_any(&self) -> &dyn std::any::Any {
self
}
}
pub struct ParallelPairedAssigner {
max_mismatches: u32,
pool: rayon::ThreadPool,
}
impl ParallelPairedAssigner {
#[must_use]
pub fn new(max_mismatches: u32, threads: usize) -> Self {
let pool = rayon::ThreadPoolBuilder::new()
.num_threads(threads)
.build()
.expect("Failed to build rayon thread pool");
Self { max_mismatches, pool }
}
fn reverse_paired(umi: &str) -> Option<String> {
let parts: Vec<&str> = umi.split('-').collect();
if parts.len() == 2 { Some(format!("{}-{}", parts[1], parts[0])) } else { None }
}
fn canonicalize(umi: &str) -> String {
let upper = umi.to_uppercase();
if let Some(reversed) = Self::reverse_paired(&upper) {
if reversed < upper { reversed } else { upper }
} else {
upper
}
}
fn matches_canonical(umi: &str, canonical: &str) -> bool {
umi.to_uppercase() == canonical
}
}
impl UmiAssigner for ParallelPairedAssigner {
fn assign(&self, raw_umis: &[Umi]) -> Vec<MoleculeId> {
if raw_umis.is_empty() {
return Vec::new();
}
for umi in raw_umis {
assert!(umi.split('-').count() == 2, "UMI {umi} is not a paired UMI");
}
let mut canonical_counts: AHashMap<String, usize> = AHashMap::new();
for umi in raw_umis {
let canonical = Self::canonicalize(umi);
*canonical_counts.entry(canonical).or_insert(0) += 1;
}
let mut sorted_umis: Vec<(String, usize, BitEnc)> = canonical_counts
.iter()
.filter_map(|(umi, &count)| {
BitEnc::from_umi_str(umi).map(|enc| (umi.clone(), count, enc))
})
.collect();
if sorted_umis.is_empty() {
return (0..raw_umis.len()).map(|i| MoleculeId::Single(i as u64)).collect();
}
sorted_umis.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
let unique_umis: Vec<(BitEnc, usize)> =
sorted_umis.iter().map(|(_, count, enc)| (*enc, *count)).collect();
let max_mismatches = self.max_mismatches;
let edges = self.pool.install(|| discover_edges_parallel_k(&unique_umis, max_mismatches));
let mut adj_list: Vec<Vec<usize>> = vec![Vec::new(); unique_umis.len()];
for (i, j) in edges {
adj_list[i].push(j);
adj_list[j].push(i);
}
let mut assigned = vec![false; unique_umis.len()];
let mut mol_ids: Vec<u64> = vec![0; unique_umis.len()];
let mut next_mol_id: u64 = 0;
for root_idx in 0..unique_umis.len() {
if assigned[root_idx] {
continue;
}
let mol_id = next_mol_id;
next_mol_id += 1;
let mut queue = VecDeque::new();
queue.push_back(root_idx);
assigned[root_idx] = true;
mol_ids[root_idx] = mol_id;
while let Some(idx) = queue.pop_front() {
let parent_count = unique_umis[idx].1;
let max_child_count = parent_count / 2 + 1;
for &neighbor_idx in &adj_list[idx] {
if !assigned[neighbor_idx] {
let neighbor_count = unique_umis[neighbor_idx].1;
if neighbor_count <= max_child_count {
assigned[neighbor_idx] = true;
mol_ids[neighbor_idx] = mol_id;
queue.push_back(neighbor_idx);
}
}
}
}
}
let canonical_to_mol: AHashMap<&str, u64> = sorted_umis
.iter()
.enumerate()
.map(|(i, (umi, _, _))| (umi.as_str(), mol_ids[i]))
.collect();
raw_umis
.iter()
.map(|umi| {
let canonical = Self::canonicalize(umi);
if let Some(&base_id) = canonical_to_mol.get(canonical.as_str()) {
if Self::matches_canonical(umi, &canonical) {
MoleculeId::PairedA(base_id)
} else {
MoleculeId::PairedB(base_id)
}
} else {
MoleculeId::None
}
})
.collect()
}
fn split_templates_by_pair_orientation(&self) -> bool {
false
}
fn as_any(&self) -> &dyn std::any::Any {
self
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_union_find_basic() {
let uf = UnionFind::new(5);
for i in 0..5 {
assert_eq!(uf.find(i), i);
}
assert!(uf.union(0, 1));
assert_eq!(uf.find(0), uf.find(1));
assert!(!uf.union(0, 1));
assert!(uf.union(2, 3));
assert_eq!(uf.find(2), uf.find(3));
assert_ne!(uf.find(0), uf.find(2));
assert!(uf.union(1, 3));
assert_eq!(uf.find(0), uf.find(3));
}
#[test]
fn test_union_find_large() {
let uf = UnionFind::new(1000);
for i in 0..999 {
let _ = uf.union(i, i + 1);
}
let root = uf.find(0);
for i in 1..1000 {
assert_eq!(uf.find(i), root);
}
}
#[test]
fn test_union_find_parallel() {
let uf = UnionFind::new(100);
let edges: Vec<(usize, usize)> = (0..99).map(|i| (i, i + 1)).collect();
uf.union_parallel(&edges);
let root = uf.find(0);
for i in 1..100 {
assert_eq!(uf.find(i), root);
}
}
#[test]
fn test_generate_neighbors() {
let umi = BitEnc::from_bytes(b"AA").expect("valid DNA sequence");
let neighbors: Vec<_> = generate_neighbors(&umi).collect();
assert_eq!(neighbors.len(), 6);
assert!(neighbors.contains(&BitEnc::from_bytes(b"CA").expect("valid DNA sequence")));
assert!(neighbors.contains(&BitEnc::from_bytes(b"GA").expect("valid DNA sequence")));
assert!(neighbors.contains(&BitEnc::from_bytes(b"TA").expect("valid DNA sequence")));
assert!(neighbors.contains(&BitEnc::from_bytes(b"AC").expect("valid DNA sequence")));
assert!(neighbors.contains(&BitEnc::from_bytes(b"AG").expect("valid DNA sequence")));
assert!(neighbors.contains(&BitEnc::from_bytes(b"AT").expect("valid DNA sequence")));
}
#[test]
fn test_generate_neighbors_k2() {
let umi = BitEnc::from_bytes(b"AA").expect("valid DNA sequence");
let neighbors = generate_neighbors_k(&umi, 2);
assert_eq!(neighbors.len(), 16);
assert!(neighbors.contains(&umi));
assert!(neighbors.contains(&BitEnc::from_bytes(b"CA").expect("valid DNA sequence")));
assert!(neighbors.contains(&BitEnc::from_bytes(b"CC").expect("valid DNA sequence")));
assert!(neighbors.contains(&BitEnc::from_bytes(b"TT").expect("valid DNA sequence")));
}
#[test]
fn test_discover_edges_parallel_k1() {
let umis = vec![
(BitEnc::from_bytes(b"AAAA").expect("valid DNA sequence"), 10),
(BitEnc::from_bytes(b"AAAT").expect("valid DNA sequence"), 5), (BitEnc::from_bytes(b"TTTT").expect("valid DNA sequence"), 3), ];
let edges = discover_edges_parallel_k1(&umis);
assert_eq!(edges.len(), 1);
assert!(edges.contains(&(0, 1)));
}
#[test]
fn test_discover_edges_parallel_k2() {
let umis = vec![
(BitEnc::from_bytes(b"AAAA").expect("valid DNA sequence"), 10),
(BitEnc::from_bytes(b"AATT").expect("valid DNA sequence"), 5), (BitEnc::from_bytes(b"TTTT").expect("valid DNA sequence"), 3), ];
let edges = discover_edges_parallel_k(&umis, 2);
assert_eq!(edges.len(), 2);
assert!(edges.contains(&(0, 1)));
assert!(edges.contains(&(1, 2)));
}
#[test]
fn test_discover_edges_parallel_chain() {
let umis = vec![
(BitEnc::from_bytes(b"AAAA").expect("valid DNA sequence"), 10),
(BitEnc::from_bytes(b"AAAT").expect("valid DNA sequence"), 5), (BitEnc::from_bytes(b"AATT").expect("valid DNA sequence"), 3), ];
let edges = discover_edges_parallel_k1(&umis);
assert_eq!(edges.len(), 2);
assert!(edges.contains(&(0, 1)));
assert!(edges.contains(&(1, 2)));
}
#[test]
fn test_parallel_identity_basic() {
let assigner = ParallelIdentityAssigner::new(2);
let umis: Vec<String> =
vec!["AAAA", "AAAA", "TTTT"].into_iter().map(String::from).collect();
let assignments = assigner.assign(&umis);
assert_eq!(assignments[0], assignments[1]);
assert_ne!(assignments[0], assignments[2]);
}
#[test]
fn test_parallel_identity_case_insensitive() {
let assigner = ParallelIdentityAssigner::new(2);
let umis: Vec<String> =
vec!["ACGT", "acgt", "AcGt"].into_iter().map(String::from).collect();
let assignments = assigner.assign(&umis);
assert_eq!(assignments[0], assignments[1]);
assert_eq!(assignments[1], assignments[2]);
}
#[test]
fn test_parallel_identity_empty() {
let assigner = ParallelIdentityAssigner::new(2);
let umis: Vec<String> = vec![];
let assignments = assigner.assign(&umis);
assert!(assignments.is_empty());
}
#[test]
fn test_parallel_identity_single() {
let assigner = ParallelIdentityAssigner::new(2);
let umis: Vec<String> = vec!["ACGT".to_string()];
let assignments = assigner.assign(&umis);
assert_eq!(assignments.len(), 1);
}
#[test]
fn test_parallel_identity_deterministic() {
let umis: Vec<String> = vec!["CCCC", "AAAA", "TTTT", "GGGG", "AAAA", "CCCC"]
.into_iter()
.map(String::from)
.collect();
let assigner1 = ParallelIdentityAssigner::new(2);
let assigner2 = ParallelIdentityAssigner::new(4);
let result1 = assigner1.assign(&umis);
let result2 = assigner2.assign(&umis);
assert_same_groupings(&result1, &result2);
}
#[test]
fn test_parallel_identity_many_duplicates() {
let assigner = ParallelIdentityAssigner::new(4);
let mut umis: Vec<String> = Vec::new();
umis.extend(std::iter::repeat_n("AAAAAA".to_string(), 100));
umis.extend(std::iter::repeat_n("TTTTTT".to_string(), 100));
umis.extend(std::iter::repeat_n("CCCCCC".to_string(), 100));
let assignments = assigner.assign(&umis);
for i in 1..100 {
assert_eq!(assignments[0], assignments[i]);
}
for i in 101..200 {
assert_eq!(assignments[100], assignments[i]);
}
assert_ne!(assignments[0], assignments[100]);
assert_ne!(assignments[0], assignments[200]);
assert_ne!(assignments[100], assignments[200]);
}
#[test]
fn test_parallel_edit_basic() {
let assigner = ParallelEditAssigner::new(1, 2);
let umis: Vec<String> =
vec!["AAAA", "AAAT", "TTTT"].into_iter().map(String::from).collect();
let assignments = assigner.assign(&umis);
assert_eq!(assignments[0], assignments[1]);
assert_ne!(assignments[0], assignments[2]);
}
#[test]
fn test_parallel_edit_k2() {
let assigner = ParallelEditAssigner::new(2, 2);
let umis: Vec<String> =
vec!["AAAA", "AATT", "TTTT"].into_iter().map(String::from).collect();
let assignments = assigner.assign(&umis);
assert_eq!(assignments[0], assignments[1]);
assert_eq!(assignments[1], assignments[2]);
}
#[test]
fn test_parallel_edit_transitive() {
let assigner = ParallelEditAssigner::new(1, 2);
let umis: Vec<String> =
vec!["AAAA", "AAAT", "AATT"].into_iter().map(String::from).collect();
let assignments = assigner.assign(&umis);
assert_eq!(assignments[0], assignments[1]);
assert_eq!(assignments[1], assignments[2]);
}
#[test]
fn test_parallel_edit_empty() {
let assigner = ParallelEditAssigner::new(1, 2);
let umis: Vec<String> = vec![];
let assignments = assigner.assign(&umis);
assert!(assignments.is_empty());
}
#[test]
fn test_parallel_edit_single() {
let assigner = ParallelEditAssigner::new(1, 2);
let umis: Vec<String> = vec!["ACGT".to_string()];
let assignments = assigner.assign(&umis);
assert_eq!(assignments.len(), 1);
}
#[test]
fn test_parallel_edit_identical_umis() {
let assigner = ParallelEditAssigner::new(1, 2);
let umis: Vec<String> =
vec!["ACGT", "ACGT", "ACGT"].into_iter().map(String::from).collect();
let assignments = assigner.assign(&umis);
assert_eq!(assignments[0], assignments[1]);
assert_eq!(assignments[1], assignments[2]);
}
#[test]
fn test_parallel_edit_paired_umis_with_dash() {
let assigner = ParallelEditAssigner::new(1, 2);
let umis: Vec<String> = vec![
"ACGTACGT-TGCATGCA".to_string(), "ACGTACGT-TGCATGCA".to_string(),
"ACGTACGT-TGCATGCT".to_string(), "GGGGGGGG-CCCCCCCC".to_string(), ];
let assignments = assigner.assign(&umis);
assert_eq!(assignments[0], assignments[1], "Identical paired UMIs should have same ID");
assert_eq!(assignments[0], assignments[2], "Paired UMIs within 1 edit should have same ID");
assert_ne!(
assignments[0], assignments[3],
"Different paired UMIs should have different IDs"
);
}
#[test]
fn test_parallel_adjacency_paired_umis_with_dash() {
let assigner = ParallelAdjacencyAssigner::new(1, 2);
let umis: Vec<String> = vec![
"ACGTACGT-TGCATGCA".to_string(),
"ACGTACGT-TGCATGCA".to_string(),
"ACGTACGT-TGCATGCT".to_string(), ];
let assignments = assigner.assign(&umis);
assert_eq!(assignments[0], assignments[1], "Identical paired UMIs should have same ID");
}
#[test]
fn test_parallel_edit_case_insensitive() {
let assigner = ParallelEditAssigner::new(1, 2);
let umis: Vec<String> =
vec!["ACGT", "acgt", "AcGt"].into_iter().map(String::from).collect();
let assignments = assigner.assign(&umis);
assert_eq!(assignments[0], assignments[1]);
assert_eq!(assignments[1], assignments[2]);
}
#[test]
fn test_parallel_edit_invalid_umis() {
let assigner = ParallelEditAssigner::new(1, 2);
let umis: Vec<String> = vec!["ACGT", "ACGN", "ACGT"] .into_iter()
.map(String::from)
.collect();
let assignments = assigner.assign(&umis);
assert_eq!(assignments[0], assignments[2]);
assert_ne!(assignments[0], assignments[1]);
}
#[test]
fn test_parallel_adjacency_basic() {
let assigner = ParallelAdjacencyAssigner::new(1, 2);
let umis: Vec<String> =
vec!["AAAA", "AAAA", "AAAT"].into_iter().map(String::from).collect();
let assignments = assigner.assign(&umis);
assert_eq!(assignments[0], assignments[1]); assert_eq!(assignments[0], assignments[2]); }
#[test]
fn test_parallel_adjacency_count_constraint() {
let assigner = ParallelAdjacencyAssigner::new(1, 2);
let umis: Vec<String> = vec![
"AAAA", "AAAA", "AAAA", "AAAA", "AAAT", "TTTT", "TTTT", "TTTT", "TTTT", ]
.into_iter()
.map(String::from)
.collect();
let assignments = assigner.assign(&umis);
assert_eq!(assignments[0], assignments[1]);
assert_eq!(assignments[0], assignments[2]);
assert_eq!(assignments[0], assignments[3]);
assert_eq!(assignments[0], assignments[4]);
assert_eq!(assignments[5], assignments[6]);
assert_ne!(assignments[0], assignments[5]);
}
#[test]
fn test_parallel_adjacency_cascade() {
let assigner = ParallelAdjacencyAssigner::new(1, 2);
let mut umis: Vec<String> = Vec::new();
umis.extend(std::iter::repeat_n("AAAA".to_string(), 10));
umis.extend(std::iter::repeat_n("AAAT".to_string(), 4));
umis.extend(std::iter::repeat_n("AATT".to_string(), 1));
let assignments = assigner.assign(&umis);
let first_mol = assignments[0];
for assignment in &assignments {
assert_eq!(*assignment, first_mol);
}
}
#[test]
fn test_parallel_adjacency_deterministic_tiebreak() {
let assigner = ParallelAdjacencyAssigner::new(1, 2);
let umis: Vec<String> = vec![
"AAAAAA", "AAAAAA", "AAAGAC", "AAAGAC", "AAAAAC", ]
.into_iter()
.map(String::from)
.collect();
let assignments1 = assigner.assign(&umis);
let assignments2 = assigner.assign(&umis);
assert_eq!(assignments1, assignments2);
assert_eq!(assignments1[0], assignments1[4]);
}
#[test]
fn test_parallel_adjacency_empty() {
let assigner = ParallelAdjacencyAssigner::new(1, 2);
let umis: Vec<String> = vec![];
let assignments = assigner.assign(&umis);
assert!(assignments.is_empty());
}
#[test]
fn test_parallel_adjacency_single() {
let assigner = ParallelAdjacencyAssigner::new(1, 2);
let umis: Vec<String> = vec!["ACGT".to_string()];
let assignments = assigner.assign(&umis);
assert_eq!(assignments.len(), 1);
}
#[test]
fn test_parallel_paired_basic() {
let assigner = ParallelPairedAssigner::new(1, 2);
let umis: Vec<String> = vec![
"AAAA-CCCC".to_string(), "CCCC-AAAA".to_string(), ];
let assignments = assigner.assign(&umis);
match (&assignments[0], &assignments[1]) {
(MoleculeId::PairedA(a), MoleculeId::PairedB(b))
| (MoleculeId::PairedB(a), MoleculeId::PairedA(b)) => assert_eq!(a, b),
_ => unreachable!("Expected PairedA and PairedB, got {assignments:?}"),
}
}
#[test]
fn test_parallel_paired_with_errors() {
let assigner = ParallelPairedAssigner::new(1, 2);
let umis: Vec<String> = vec![
"AAAA-CCCC".to_string(),
"AAAA-CCCC".to_string(),
"AAAT-CCCC".to_string(), ];
let assignments = assigner.assign(&umis);
let base_id = match &assignments[0] {
MoleculeId::PairedA(id) | MoleculeId::PairedB(id) => *id,
_ => unreachable!("Expected paired ID"),
};
for assignment in &assignments {
match assignment {
MoleculeId::PairedA(id) | MoleculeId::PairedB(id) => {
assert_eq!(*id, base_id);
}
_ => unreachable!("Expected paired ID"),
}
}
}
#[test]
fn test_parallel_paired_canonicalization() {
assert_eq!(
ParallelPairedAssigner::canonicalize("AAAA-CCCC"),
"AAAA-CCCC" );
assert_eq!(
ParallelPairedAssigner::canonicalize("CCCC-AAAA"),
"AAAA-CCCC" );
assert_eq!(
ParallelPairedAssigner::canonicalize("TTTT-AAAA"),
"AAAA-TTTT" );
}
#[test]
fn test_parallel_paired_canonicalization_mixed_case() {
assert_eq!(ParallelPairedAssigner::canonicalize("aaaa-cccc"), "AAAA-CCCC");
assert_eq!(ParallelPairedAssigner::canonicalize("cccc-aaaa"), "AAAA-CCCC");
assert_eq!(ParallelPairedAssigner::canonicalize("Tttt-Aaaa"), "AAAA-TTTT");
assert_eq!(
ParallelPairedAssigner::canonicalize("aaaa-CCCC"),
ParallelPairedAssigner::canonicalize("AAAA-cccc")
);
}
#[test]
fn test_edit_equivalence_with_sequential() {
let parallel = ParallelEditAssigner::new(1, 4);
let umis: Vec<String> = vec!["AAAAAA", "AAAAAT", "AAAATT", "TTTTTT", "TTTTTA", "GGGGGG"]
.into_iter()
.map(String::from)
.collect();
let parallel_result = parallel.assign(&umis);
assert_eq!(parallel_result[0], parallel_result[1]);
assert_eq!(parallel_result[1], parallel_result[2]);
assert_eq!(parallel_result[3], parallel_result[4]);
assert_ne!(parallel_result[5], parallel_result[0]);
assert_ne!(parallel_result[5], parallel_result[3]);
assert_ne!(parallel_result[0], parallel_result[3]);
}
fn assert_same_groupings(a: &[MoleculeId], b: &[MoleculeId]) {
assert_eq!(a.len(), b.len());
let mut a_groups: AHashMap<MoleculeId, Vec<usize>> = AHashMap::new();
let mut b_groups: AHashMap<MoleculeId, Vec<usize>> = AHashMap::new();
for (i, (ma, mb)) in a.iter().zip(b.iter()).enumerate() {
a_groups.entry(*ma).or_default().push(i);
b_groups.entry(*mb).or_default().push(i);
}
let mut a_sorted: Vec<Vec<usize>> = a_groups.into_values().collect();
let mut b_sorted: Vec<Vec<usize>> = b_groups.into_values().collect();
for v in &mut a_sorted {
v.sort_unstable();
}
for v in &mut b_sorted {
v.sort_unstable();
}
a_sorted.sort();
b_sorted.sort();
assert_eq!(a_sorted, b_sorted, "Groupings do not match");
}
#[test]
fn test_parallel_edit_vs_parallel_edit_deterministic() {
let umis: Vec<String> = vec![
"AAAAAA", "AAAAAT", "AAAATT", "TTTTTT", "TTTTTA", "GGGGGG", "CCCCCC", "CCCCCG",
"CCCCGG",
]
.into_iter()
.map(String::from)
.collect();
let assigner1 = ParallelEditAssigner::new(1, 4);
let assigner2 = ParallelEditAssigner::new(1, 4);
let result1 = assigner1.assign(&umis);
let result2 = assigner2.assign(&umis);
assert_same_groupings(&result1, &result2);
}
#[test]
fn test_parallel_adjacency_vs_parallel_adjacency_deterministic() {
let mut umis: Vec<String> = Vec::new();
umis.extend(std::iter::repeat_n("AAAAAA".to_string(), 10));
umis.extend(std::iter::repeat_n("AAAAAT".to_string(), 3));
umis.extend(std::iter::repeat_n("TTTTTT".to_string(), 8));
umis.extend(std::iter::repeat_n("TTTTTA".to_string(), 2));
let assigner1 = ParallelAdjacencyAssigner::new(1, 4);
let assigner2 = ParallelAdjacencyAssigner::new(1, 4);
let result1 = assigner1.assign(&umis);
let result2 = assigner2.assign(&umis);
assert_same_groupings(&result1, &result2);
}
}