use crate::{
consistent_genes::{groubygene, CUGset, Ec2GeneMapper},
io::{BusRecord, CUGIterator},
};
/// Iterator adapter that groups consecutive bus records sharing the same
/// (CB, UMI) pair.
///
/// It works with a one-record lookahead: the record most recently pulled from
/// the source is parked in `last_record` until the record after it is known.
pub struct CbUmiGroup<I>
where
    I: CUGIterator,
{
    /// Source of the records to be grouped.
    iter: I,
    /// Record already pulled from `iter` but not yet handed out in a group.
    last_record: Option<BusRecord>,
}
impl<I> Iterator for CbUmiGroup<I>
where
    I: CUGIterator,
{
    type Item = ((u64, u64), Vec<BusRecord>);

    /// Returns the next `((CB, UMI), records)` group, or `None` once both the
    /// source and the lookahead slot are used up.
    ///
    /// # Panics
    /// Panics with an "Unsorted busfile" message when a record's (CB, UMI) is
    /// smaller than that of the record before it. Also panics if the source
    /// yields a record while the lookahead slot is empty.
    fn next(&mut self) -> Option<Self::Item> {
        let mut group: Vec<BusRecord> = Vec::new();

        while let Some(incoming) = self.iter.next() {
            let incoming_key = (incoming.CB, incoming.UMI);
            // Park the fresh record in the lookahead slot; the record it
            // displaces belongs to the group being assembled.
            let previous = self.last_record.replace(incoming).unwrap();
            let previous_key = (previous.CB, previous.UMI);
            group.push(previous);

            // Tuples compare lexicographically: CB first, then UMI.
            match incoming_key.cmp(&previous_key) {
                std::cmp::Ordering::Greater => return Some((previous_key, group)),
                std::cmp::Ordering::Equal => {}
                std::cmp::Ordering::Less => panic!(
                    "Unsorted busfile: {}/{} -> {}/{}",
                    previous_key.0, previous_key.1, incoming_key.0, incoming_key.1
                ),
            }
        }

        // Source exhausted: whatever is still parked closes the final group.
        let tail = self.last_record.take()?;
        let tail_key = (tail.CB, tail.UMI);
        group.push(tail);
        Some((tail_key, group))
    }
}
impl<I> CbUmiGroup<I>
where
    I: CUGIterator,
{
    /// Wraps `iter`, eagerly pulling its first record into the lookahead slot.
    ///
    /// An empty `iter` simply leaves the lookahead slot as `None`.
    pub fn new(mut iter: I) -> Self {
        Self {
            last_record: iter.next(),
            iter,
        }
    }
}
/// Iterator adapter that groups bus records by (CB, UMI), keeping the group
/// currently being assembled in a buffer instead of a single lookahead record.
pub struct CbUmiGroupFaster<I>
where
    I: CUGIterator,
{
    /// Source of the records to be grouped.
    iter: I,
    /// Records collected so far for the group keyed by `current_cbumi`.
    current_records: Vec<BusRecord>,
    /// (CB, UMI) key of the records held in `current_records`.
    current_cbumi: (u64, u64),
}
impl<I> CbUmiGroupFaster<I>
where
    I: CUGIterator,
{
    /// Creates the grouping iterator, seeding the first group with the first
    /// record of `iter`.
    ///
    /// An empty `iter` is accepted rather than causing a panic: the record
    /// buffer is then left empty, which `next` reports as the end of
    /// iteration once the source returns `None`.
    pub fn new(mut iter: I) -> Self {
        match iter.next() {
            Some(first) => {
                let current_cbumi = (first.CB, first.UMI);
                Self {
                    iter,
                    current_records: vec![first],
                    current_cbumi,
                }
            }
            // Nothing to group. The key is only a placeholder: with an empty
            // buffer it is never handed out.
            None => Self {
                iter,
                current_records: Vec::new(),
                current_cbumi: (0, 0),
            },
        }
    }
}
impl<I> Iterator for CbUmiGroupFaster<I>
where
    I: CUGIterator,
{
    type Item = ((u64, u64), Vec<BusRecord>);

    /// Returns the next `((CB, UMI), records)` group, or `None` when the
    /// source is exhausted and the buffer is empty.
    ///
    /// # Panics
    /// Panics with an "Unsorted busfile" message when a record's (CB, UMI) is
    /// smaller than the key of the group being assembled.
    fn next(&mut self) -> Option<Self::Item> {
        while let Some(record) = self.iter.next() {
            let key = (record.CB, record.UMI);
            match key.cmp(&self.current_cbumi) {
                // Same key: the record joins the group in progress.
                std::cmp::Ordering::Equal => self.current_records.push(record),
                // Larger key: hand out the finished group and start a new one
                // with this record.
                std::cmp::Ordering::Greater => {
                    let finished_key = std::mem::replace(&mut self.current_cbumi, key);
                    let finished = std::mem::replace(&mut self.current_records, vec![record]);
                    return Some((finished_key, finished));
                }
                std::cmp::Ordering::Less => {
                    panic!("Unsorted busfile: {:?} -> {:?}", self.current_cbumi, key)
                }
            }
        }

        // Source exhausted: flush the buffered group, if there is one.
        if self.current_records.is_empty() {
            return None;
        }
        let final_key = std::mem::take(&mut self.current_cbumi);
        Some((final_key, std::mem::take(&mut self.current_records)))
    }
}
/// Extension trait that adds (CB, UMI) grouping to any [`CUGIterator`].
pub trait CbUmiGroupIterator: CUGIterator + Sized {
    /// Consumes `self` and wraps it in a [`CbUmiGroupFaster`], which yields
    /// `((CB, UMI), records)` groups.
    fn groupby_cbumi(self) -> CbUmiGroupFaster<Self> {
        CbUmiGroupFaster::<Self>::new(self)
    }
}
// Blanket impl: every `CUGIterator` gets `groupby_cbumi`.
impl<I> CbUmiGroupIterator for I where I: CUGIterator {}
/// Iterator adapter that groups consecutive bus records sharing the same CB.
///
/// It works with a one-record lookahead: the record most recently pulled from
/// the source is parked in `last_record` until the record after it is known.
pub struct CellGroup<I>
where
    I: CUGIterator,
{
    /// Source of the records to be grouped.
    iter: I,
    /// Record already pulled from `iter` but not yet handed out in a group.
    last_record: Option<BusRecord>,
}
impl<I> Iterator for CellGroup<I>
where
    I: CUGIterator,
{
    type Item = (u64, Vec<BusRecord>);

    /// Returns the next `(CB, records)` group, or `None` once both the source
    /// and the lookahead slot are used up.
    ///
    /// # Panics
    /// Panics with an "Unsorted busfile" message when a record's CB is smaller
    /// than that of the record before it. Also panics if the source yields a
    /// record while the lookahead slot is empty.
    fn next(&mut self) -> Option<Self::Item> {
        let mut busrecords: Vec<BusRecord> = Vec::new();

        while let Some(new_record) = self.iter.next() {
            let new_cb = new_record.CB;
            // Park the fresh record in the lookahead slot; the record it
            // displaces belongs to the group being assembled.
            let last_record = self
                .last_record
                .replace(new_record)
                .expect("lookahead must hold a record while the source still yields");
            let current_cb = last_record.CB;
            busrecords.push(last_record);

            match new_cb.cmp(&current_cb) {
                // Same cell: keep collecting.
                std::cmp::Ordering::Equal => {}
                // The parked record starts a new cell, so this group is done.
                std::cmp::Ordering::Greater => return Some((current_cb, busrecords)),
                std::cmp::Ordering::Less => {
                    panic!("Unsorted busfile: {} -> {}", current_cb, new_cb)
                }
            }
        }

        // Source exhausted: whatever is still parked closes the final group.
        let last_record = self.last_record.take()?;
        let current_cb = last_record.CB;
        busrecords.push(last_record);
        Some((current_cb, busrecords))
    }
}
impl<I> CellGroup<I>
where
    I: CUGIterator,
{
    /// Wraps `iter`, eagerly pulling its first record into the lookahead slot.
    ///
    /// An empty `iter` simply leaves the lookahead slot as `None`.
    pub fn new(mut iter: I) -> Self {
        Self {
            last_record: iter.next(),
            iter,
        }
    }
}
/// Iterator adapter that groups bus records by CB, keeping the group
/// currently being assembled in a buffer instead of a single lookahead record.
pub struct CellGroupFaster<I>
where
    I: CUGIterator,
{
    /// Source of the records to be grouped.
    iter: I,
    /// Records collected so far for the cell identified by `current_cb`.
    current_records: Vec<BusRecord>,
    /// CB of the records held in `current_records`.
    current_cb: u64,
}
impl<I> CellGroupFaster<I>
where
    I: CUGIterator,
{
    /// Creates the grouping iterator, seeding the first group with the first
    /// record of `iter`.
    ///
    /// An empty `iter` is accepted rather than causing a panic: the record
    /// buffer is then left empty, which `next` reports as the end of
    /// iteration once the source returns `None`.
    pub fn new(mut iter: I) -> Self {
        match iter.next() {
            Some(first) => {
                let current_cb = first.CB;
                Self {
                    iter,
                    current_records: vec![first],
                    current_cb,
                }
            }
            // Nothing to group. The CB is only a placeholder: with an empty
            // buffer it is never handed out.
            None => Self {
                iter,
                current_records: Vec::new(),
                current_cb: 0,
            },
        }
    }
}
impl<I> Iterator for CellGroupFaster<I>
where
    I: CUGIterator,
{
    type Item = (u64, Vec<BusRecord>);

    /// Returns the next `(CB, records)` group, or `None` when the source is
    /// exhausted and the buffer is empty.
    ///
    /// # Panics
    /// Panics with an "Unsorted busfile" message when a record's CB is smaller
    /// than the CB of the group being assembled.
    fn next(&mut self) -> Option<Self::Item> {
        while let Some(record) = self.iter.next() {
            let cb = record.CB;
            match cb.cmp(&self.current_cb) {
                // Same cell: the record joins the group in progress.
                std::cmp::Ordering::Equal => self.current_records.push(record),
                // Larger CB: hand out the finished group and start a new one
                // with this record.
                std::cmp::Ordering::Greater => {
                    let finished_cb = std::mem::replace(&mut self.current_cb, cb);
                    let finished = std::mem::replace(&mut self.current_records, vec![record]);
                    return Some((finished_cb, finished));
                }
                std::cmp::Ordering::Less => {
                    panic!("Unsorted busfile: {} -> {}", self.current_cb, cb)
                }
            }
        }

        // Source exhausted: flush the buffered group, if there is one.
        if self.current_records.is_empty() {
            return None;
        }
        let final_cb = std::mem::take(&mut self.current_cb);
        Some((final_cb, std::mem::take(&mut self.current_records)))
    }
}
/// Extension trait that adds cell-barcode grouping to any [`CUGIterator`].
pub trait CellGroupIterator: CUGIterator + Sized {
    /// Consumes `self` and wraps it in a [`CellGroupFaster`], which yields
    /// `(CB, records)` groups.
    fn groupby_cb(self) -> CellGroupFaster<Self> {
        CellGroupFaster::<Self>::new(self)
    }
}
// Blanket impl: every `CUGIterator` gets `groupby_cb`.
impl<I> CellGroupIterator for I where I: CUGIterator {}
/// Iterator adapter that passes every batch of records produced by `iter`
/// through `groubygene`, using the stored `Ec2GeneMapper`.
pub struct GroupbyGene<I> {
    /// Source of record batches.
    iter: I,
    /// Mapper handed to `groubygene` for every batch.
    ecmapper: Ec2GeneMapper,
}
impl<I> Iterator for GroupbyGene<I>
where
    I: Iterator<Item = Vec<BusRecord>>,
{
    type Item = Vec<CUGset>;

    /// Pulls the next batch of records from the source and returns the result
    /// of running `groubygene` on it; `None` once the source is exhausted.
    fn next(&mut self) -> Option<Self::Item> {
        let batch = self.iter.next()?;
        Some(groubygene(batch, &self.ecmapper))
    }
}
impl<I> GroupbyGene<I> {
pub fn new(iter: I, ecmapper: Ec2GeneMapper) -> Self {
Self { iter, ecmapper }
}
}
/// Extension trait that adds `group_by_gene` to iterators.
pub trait GroupbyGeneIterator<T>: Iterator<Item = T> + Sized {
    /// Consumes `self` and wraps it, together with `ecmapper`, in a
    /// [`GroupbyGene`] adapter.
    fn group_by_gene(self, ecmapper: Ec2GeneMapper) -> GroupbyGene<Self> {
        GroupbyGene::<Self>::new(self, ecmapper)
    }
}
// Blanket impl: every iterator gets `group_by_gene`.
impl<T, I> GroupbyGeneIterator<T> for I where I: Iterator<Item = T> {}
#[cfg(test)]
mod tests {
    use std::collections::{HashMap, HashSet};

    use crate::consistent_genes::{Ec2GeneMapper, Genename, EC};
    use crate::io::BusRecord;
    use crate::iterators::{
        CbUmiGroupIterator, CellGroup, CellGroupIterator, GroupbyGeneIterator,
    };
    use crate::utils::vec2set;

    // Builds a `BusRecord` with `FLAG: 0` from `(CB, UMI, EC, COUNT)`.
    // A macro is used so the literals take on whatever integer types the
    // record's fields declare.
    macro_rules! rec {
        ($cb:expr, $umi:expr, $ec:expr, $count:expr) => {
            BusRecord { CB: $cb, UMI: $umi, EC: $ec, COUNT: $count, FLAG: 0 }
        };
    }

    /// A lone record forms exactly one cell group.
    #[test]
    fn test_cb_single_elem() {
        let only = rec!(0, 2, 0, 12);
        let mut groups = vec![only.clone()].into_iter().groupby_cb();
        assert_eq!(groups.next(), Some((0, vec![only])));
        assert_eq!(groups.next(), None);
    }

    /// A lone record forms exactly one (CB, UMI) group.
    #[test]
    fn test_cbumi_single_elem() {
        let only = rec!(0, 2, 0, 12);
        let mut groups = vec![only.clone()].into_iter().groupby_cbumi();
        assert_eq!(groups.next(), Some(((0, 2), vec![only])));
        assert_eq!(groups.next(), None);
    }

    /// The final cell is emitted when it consists of a single record, for
    /// both `groupby_cb` and `CellGroup`.
    #[test]
    fn test_cb_iter_last_element1() {
        let records = vec![rec!(0, 2, 0, 12), rec!(0, 21, 1, 2), rec!(1, 2, 0, 12)];

        let via_adapter: Vec<Vec<BusRecord>> = records
            .clone()
            .into_iter()
            .groupby_cb()
            .map(|(_cb, group)| group)
            .collect();
        assert_eq!(via_adapter.len(), 2);
        assert_eq!(via_adapter[1].len(), 1);

        let via_cellgroup: Vec<Vec<BusRecord>> = CellGroup::new(records.into_iter())
            .map(|(_cb, group)| group)
            .collect();
        assert_eq!(via_cellgroup.len(), 2);
        assert_eq!(via_cellgroup[1].len(), 1);
    }

    /// The final cell is emitted when it consists of several records.
    #[test]
    fn test_cb_iter_last_element2() {
        let records = vec![
            rec!(0, 2, 0, 12),
            rec!(0, 21, 1, 2),
            rec!(1, 2, 0, 12),
            rec!(1, 2, 0, 12),
        ];
        let groups: Vec<_> = records.into_iter().groupby_cb().collect();
        assert_eq!(groups.len(), 2);
        let (_cb, last_group) = &groups[1];
        assert_eq!(last_group.len(), 2);
    }

    #[test]
    fn test_cb_iter() {
        let r1 = rec!(0, 2, 0, 12);
        let r2 = rec!(0, 21, 1, 2);
        let r3 = rec!(1, 2, 0, 12);
        let r4 = rec!(2, 1, 1, 2);
        let r5 = rec!(2, 21, 1, 2);
        let r6 = rec!(3, 1, 1, 2);
        let records = vec![
            r1.clone(),
            r2.clone(),
            r3.clone(),
            r4.clone(),
            r5.clone(),
            r6.clone(),
        ];

        let groups: Vec<(u64, Vec<BusRecord>)> = records.into_iter().groupby_cb().collect();

        let expected = vec![
            (0, vec![r1, r2]),
            (1, vec![r3]),
            (2, vec![r4, r5]),
            (3, vec![r6]),
        ];
        assert_eq!(groups, expected);
    }

    #[test]
    fn test_cbumi_iter() {
        let r1 = rec!(0, 1, 0, 12);
        let r2 = rec!(0, 1, 1, 2);
        let r3 = rec!(0, 2, 0, 12);
        let r4 = rec!(1, 1, 1, 2);
        let r5 = rec!(1, 2, 1, 2);
        let r6 = rec!(2, 1, 1, 2);
        let records = vec![
            r1.clone(),
            r2.clone(),
            r3.clone(),
            r4.clone(),
            r5.clone(),
            r6.clone(),
        ];

        let groups: Vec<((u64, u64), Vec<BusRecord>)> =
            records.into_iter().groupby_cbumi().collect();

        let expected = vec![
            ((0, 1), vec![r1, r2]),
            ((0, 2), vec![r3]),
            ((1, 1), vec![r4]),
            ((1, 2), vec![r5]),
            ((2, 1), vec![r6]),
        ];
        assert_eq!(groups, expected);
    }

    #[test]
    #[should_panic(expected = "Unsorted busfile: 2 -> 0")]
    fn test_panic_on_unsorted() {
        // The last record drops back from CB 2 to CB 0.
        let records = vec![
            rec!(0, 2, 0, 12),
            rec!(0, 21, 1, 2),
            rec!(2, 2, 0, 12),
            rec!(0, 1, 1, 2),
        ];
        records.into_iter().groupby_cb().count();
    }

    #[test]
    #[should_panic(expected = "Unsorted busfile: (2, 2) -> (0, 1)")]
    fn test_panic_on_unsorted_cbumi() {
        // The last record drops back from (2, 2) to (0, 1).
        let records = vec![
            rec!(0, 2, 0, 12),
            rec!(0, 21, 1, 2),
            rec!(2, 2, 0, 12),
            rec!(0, 1, 1, 2),
        ];
        records.into_iter().groupby_cbumi().count();
    }

    #[test]
    fn test_groupby_genes() {
        let gene = |name: &str| Genename(name.to_string());
        let ec0: HashSet<Genename> = vec2set(vec![gene("A")]);
        let ec1: HashSet<Genename> = vec2set(vec![gene("B")]);
        let ec2: HashSet<Genename> = vec2set(vec![gene("A"), gene("B")]);
        let ec3: HashSet<Genename> = vec2set(vec![gene("C"), gene("D")]);
        let ec_dict: HashMap<EC, HashSet<Genename>> =
            HashMap::from([(EC(0), ec0), (EC(1), ec1), (EC(2), ec2), (EC(3), ec3)]);
        let mapper = Ec2GeneMapper::new(ec_dict);

        // Two (CB, UMI) groups: (0, 1) with ECs 0 and 2, (0, 2) with ECs 0 and 1.
        let records = vec![
            rec!(0, 1, 0, 2),
            rec!(0, 1, 2, 2),
            rec!(0, 2, 0, 3),
            rec!(0, 2, 1, 4),
        ];

        let results: Vec<_> = records
            .into_iter()
            .groupby_cbumi()
            .map(|(_cbumi, group)| group)
            .group_by_gene(mapper)
            .collect();

        assert_eq!(results.len(), 2);
        assert_eq!(results[0].len(), 1);
        assert_eq!(results[1].len(), 2);
        println!("{:?}", results)
    }
}