use crate::{
consistent_genes::{groubygene, CUGset, Ec2GeneMapper},
io::{BusRecord, CUGIterator},
};
/// Iterator adapter that groups consecutive records sharing the same
/// (CB, UMI) pair.
///
/// Its `Iterator` impl yields `((CB, UMI), Vec<BusRecord>)` and panics when a
/// record's (CB, UMI) is smaller than that of the record before it.
pub struct CbUmiGroup<I: CUGIterator> {
/// Source of the records to be grouped.
iter: I,
/// Most recently read record that has not been emitted yet; it belongs to
/// the group that the next call to `next` will build. `None` once the
/// source has been drained (or if it was empty from the start).
last_record: Option<BusRecord>, }
impl<I> Iterator for CbUmiGroup<I>
where
    I: CUGIterator,
{
    type Item = ((u64, u64), Vec<BusRecord>);

    /// Returns the next run of consecutive records that share one (CB, UMI)
    /// pair, together with that pair.
    ///
    /// The record stored in `last_record` always belongs to the group being
    /// built. Every freshly read record takes its place, and the group is
    /// emitted as soon as the fresh record carries a larger (CB, UMI) key.
    ///
    /// # Panics
    ///
    /// Panics if a record's (CB, UMI) is smaller than its predecessor's
    /// (the input is not sorted), or if the source yields a record while no
    /// look-ahead record is stored.
    fn next(&mut self) -> Option<Self::Item> {
        use std::cmp::Ordering;

        let mut group: Vec<BusRecord> = Vec::new();

        while let Some(incoming) = self.iter.next() {
            // Read the key before `incoming` is moved into the look-ahead slot.
            let incoming_key = (incoming.CB, incoming.UMI);
            let previous = self.last_record.replace(incoming).unwrap();
            let previous_key = (previous.CB, previous.UMI);
            group.push(previous);

            // Tuples compare lexicographically: CB first, then UMI.
            match incoming_key.cmp(&previous_key) {
                Ordering::Greater => return Some((previous_key, group)),
                Ordering::Equal => {}
                Ordering::Less => panic!(
                    "Unsorted busfile: {}/{} -> {}/{}",
                    previous_key.0, previous_key.1, incoming_key.0, incoming_key.1
                ),
            }
        }

        // Source drained: the look-ahead record (if any) closes the last group.
        let tail = self.last_record.take()?;
        let tail_key = (tail.CB, tail.UMI);
        group.push(tail);
        Some((tail_key, group))
    }
}
impl<I> CbUmiGroup<I>
where
    I: CUGIterator,
{
    /// Builds the adapter around `iter`.
    ///
    /// The first record is read eagerly and kept as the look-ahead record;
    /// for an empty `iter` the look-ahead slot is `None`.
    pub fn new(mut iter: I) -> Self {
        let first = iter.next();
        Self {
            iter,
            last_record: first,
        }
    }
}
/// Iterator adapter that groups consecutive records sharing the same
/// (CB, UMI) pair, accumulating each group inside the struct.
///
/// Its `Iterator` impl yields `((CB, UMI), Vec<BusRecord>)` and panics when a
/// record's (CB, UMI) is smaller than the key of the group being built.
pub struct CbUmiGroupFaster<I: CUGIterator> {
/// Source of the records to be grouped.
iter: I,
// `current_records`: records collected so far for the group being built.
// `current_cbumi`: the (CB, UMI) key shared by those records.
current_records: Vec<BusRecord>, current_cbumi: (u64, u64),
}
impl<I> CbUmiGroupFaster<I>
where
    I: CUGIterator,
{
    /// Builds the adapter around `iter`, eagerly reading the first record to
    /// seed the first group.
    ///
    /// An empty `iter` is accepted: the adapter then starts with no buffered
    /// records and a default `(0, 0)` key, which is the same state the
    /// `Iterator` impl leaves behind after emitting its last group. In that
    /// state `next` returns `None` for as long as `iter` keeps returning
    /// `None`.
    pub fn new(mut iter: I) -> Self {
        let (current_records, current_cbumi) = match iter.next() {
            Some(first) => {
                let key = (first.CB, first.UMI);
                (vec![first], key)
            }
            // Nothing to group: start in the "exhausted" state instead of panicking.
            None => (Vec::new(), Default::default()),
        };
        Self {
            iter,
            current_records,
            current_cbumi,
        }
    }
}
impl<I> Iterator for CbUmiGroupFaster<I>
where
    I: CUGIterator,
{
    type Item = ((u64, u64), Vec<BusRecord>);

    /// Returns the next run of consecutive records that share one (CB, UMI)
    /// pair, together with that pair.
    ///
    /// Records are appended to `current_records` while their key matches
    /// `current_cbumi`. The first record with a larger key starts a new
    /// buffer and the finished one is handed out.
    ///
    /// # Panics
    ///
    /// Panics if a record's (CB, UMI) is smaller than the key of the group
    /// currently being built (the input is not sorted).
    fn next(&mut self) -> Option<Self::Item> {
        use std::cmp::Ordering;

        while let Some(incoming) = self.iter.next() {
            let incoming_key = (incoming.CB, incoming.UMI);
            match incoming_key.cmp(&self.current_cbumi) {
                Ordering::Less => panic!(
                    "Unsorted busfile: {:?} -> {:?}",
                    self.current_cbumi, incoming_key
                ),
                Ordering::Equal => self.current_records.push(incoming),
                Ordering::Greater => {
                    // Swap in the new group's state and hand out the finished one.
                    let finished_key = std::mem::replace(&mut self.current_cbumi, incoming_key);
                    let finished =
                        std::mem::replace(&mut self.current_records, vec![incoming]);
                    return Some((finished_key, finished));
                }
            }
        }

        // Source drained: flush whatever is still buffered, exactly once.
        let remaining = std::mem::take(&mut self.current_records);
        if remaining.is_empty() {
            return None;
        }
        let key = std::mem::take(&mut self.current_cbumi);
        Some((key, remaining))
    }
}
/// Extension trait that adds `groupby_cbumi` to every `CUGIterator`.
pub trait CbUmiGroupIterator: CUGIterator + Sized {
/// Consumes `self` and wraps it in a `CbUmiGroupFaster`, which yields
/// `((CB, UMI), Vec<BusRecord>)` groups of consecutive records.
fn groupby_cbumi(self) -> CbUmiGroupFaster<Self> {
CbUmiGroupFaster::new(self)
}
}
// Blanket impl: every `CUGIterator` gets `groupby_cbumi` without further code.
impl<I: CUGIterator> CbUmiGroupIterator for I {}
/// Iterator adapter that groups consecutive records sharing the same CB.
///
/// Its `Iterator` impl yields `(CB, Vec<BusRecord>)` and panics when a
/// record's CB is smaller than that of the record before it.
pub struct CellGroup<I: CUGIterator> {
/// Source of the records to be grouped.
iter: I,
/// Most recently read record that has not been emitted yet; it belongs to
/// the group that the next call to `next` will build. `None` once the
/// source has been drained (or if it was empty from the start).
last_record: Option<BusRecord>, }
impl<I> Iterator for CellGroup<I>
where
    I: CUGIterator,
{
    type Item = (u64, Vec<BusRecord>);

    /// Returns the next run of consecutive records that share one CB,
    /// together with that CB.
    ///
    /// The record stored in `last_record` always belongs to the group being
    /// built. Every freshly read record takes its place, and the group is
    /// emitted as soon as the fresh record carries a larger CB.
    ///
    /// # Panics
    ///
    /// Panics if a record's CB is smaller than its predecessor's (the input
    /// is not sorted), or if the source yields a record while no look-ahead
    /// record is stored.
    fn next(&mut self) -> Option<Self::Item> {
        let mut busrecords: Vec<BusRecord> = Vec::new();

        while let Some(new_record) = self.iter.next() {
            // Read the CB before `new_record` is moved into the look-ahead slot.
            let new_cb = new_record.CB;
            let last_record = self.last_record.replace(new_record).unwrap();
            let current_cb = last_record.CB;
            busrecords.push(last_record);

            match new_cb.cmp(&current_cb) {
                // Same cell: keep collecting.
                std::cmp::Ordering::Equal => {}
                // The fresh record opens the next cell; it stays in `last_record`.
                std::cmp::Ordering::Greater => return Some((current_cb, busrecords)),
                std::cmp::Ordering::Less => {
                    panic!("Unsorted busfile: {} -> {}", current_cb, new_cb)
                }
            }
        }

        // Source drained: the look-ahead record (if any) closes the last group.
        let last_record = self.last_record.take()?;
        let current_cb = last_record.CB;
        busrecords.push(last_record);
        Some((current_cb, busrecords))
    }
}
impl<I> CellGroup<I>
where
    I: CUGIterator,
{
    /// Builds the adapter around `iter`.
    ///
    /// The first record is read eagerly and kept as the look-ahead record;
    /// for an empty `iter` the look-ahead slot is `None`.
    pub fn new(mut iter: I) -> Self {
        let first = iter.next();
        Self {
            iter,
            last_record: first,
        }
    }
}
/// Iterator adapter that groups consecutive records sharing the same CB,
/// accumulating each group inside the struct.
///
/// Its `Iterator` impl yields `(CB, Vec<BusRecord>)` and panics when a
/// record's CB is smaller than the CB of the group being built.
pub struct CellGroupFaster<I: CUGIterator> {
/// Source of the records to be grouped.
iter: I,
/// Records collected so far for the group being built.
current_records: Vec<BusRecord>,
/// CB shared by the records in `current_records`.
current_cb: u64,
}
impl<I> CellGroupFaster<I>
where
    I: CUGIterator,
{
    /// Builds the adapter around `iter`, eagerly reading the first record to
    /// seed the first group.
    ///
    /// An empty `iter` is accepted: the adapter then starts with no buffered
    /// records and a default CB of `0`, which is the same state the
    /// `Iterator` impl leaves behind after emitting its last group. In that
    /// state `next` returns `None` for as long as `iter` keeps returning
    /// `None`.
    pub fn new(mut iter: I) -> Self {
        let (current_records, current_cb) = match iter.next() {
            Some(first) => {
                let cb = first.CB;
                (vec![first], cb)
            }
            // Nothing to group: start in the "exhausted" state instead of panicking.
            None => (Vec::new(), Default::default()),
        };
        Self {
            iter,
            current_records,
            current_cb,
        }
    }
}
impl<I> Iterator for CellGroupFaster<I>
where
    I: CUGIterator,
{
    type Item = (u64, Vec<BusRecord>);

    /// Returns the next run of consecutive records that share one CB,
    /// together with that CB.
    ///
    /// Records are appended to `current_records` while their CB matches
    /// `current_cb`. The first record with a larger CB starts a new buffer
    /// and the finished one is handed out.
    ///
    /// # Panics
    ///
    /// Panics if a record's CB is smaller than the CB of the group currently
    /// being built (the input is not sorted).
    fn next(&mut self) -> Option<Self::Item> {
        use std::cmp::Ordering;

        while let Some(incoming) = self.iter.next() {
            let incoming_cb = incoming.CB;
            match incoming_cb.cmp(&self.current_cb) {
                Ordering::Less => {
                    panic!("Unsorted busfile: {} -> {}", self.current_cb, incoming_cb)
                }
                Ordering::Equal => self.current_records.push(incoming),
                Ordering::Greater => {
                    // Swap in the new cell's state and hand out the finished one.
                    let finished_cb = std::mem::replace(&mut self.current_cb, incoming_cb);
                    let finished =
                        std::mem::replace(&mut self.current_records, vec![incoming]);
                    return Some((finished_cb, finished));
                }
            }
        }

        // Source drained: flush whatever is still buffered, exactly once.
        let remaining = std::mem::take(&mut self.current_records);
        if remaining.is_empty() {
            return None;
        }
        let cb = std::mem::take(&mut self.current_cb);
        Some((cb, remaining))
    }
}
/// Extension trait that adds `groupby_cb` to every `CUGIterator`.
pub trait CellGroupIterator: CUGIterator + Sized {
/// Consumes `self` and wraps it in a `CellGroupFaster`, which yields
/// `(CB, Vec<BusRecord>)` groups of consecutive records.
fn groupby_cb(self) -> CellGroupFaster<Self> {
CellGroupFaster::new(self)
}
}
// Blanket impl: every `CUGIterator` gets `groupby_cb` without further code.
impl<I: CUGIterator> CellGroupIterator for I {}
/// Iterator adapter that passes every `Vec<BusRecord>` of the wrapped
/// iterator through `groubygene`, yielding a `Vec<CUGset>` per input item.
pub struct GroupbyGene<I> {
/// Wrapped iterator supplying the record groups.
iter: I,
/// Mapper handed (by reference) to `groubygene` for every group.
ecmapper: Ec2GeneMapper,
}
impl<I> Iterator for GroupbyGene<I>
where
    I: Iterator<Item = Vec<BusRecord>>,
{
    type Item = Vec<CUGset>;

    /// Pulls the next record group from the wrapped iterator and returns the
    /// result of `groubygene` for it; `None` once the wrapped iterator ends.
    fn next(&mut self) -> Option<Self::Item> {
        let records = self.iter.next()?;
        Some(groubygene(records, &self.ecmapper))
    }
}
impl<I> GroupbyGene<I> {
/// Creates the adapter from the iterator to wrap and the mapper that will
/// be passed to `groubygene` for every item. Nothing is read from `iter`
/// here.
pub fn new(iter: I, ecmapper: Ec2GeneMapper) -> Self {
Self { iter, ecmapper }
}
}
/// Extension trait that adds `group_by_gene` to iterators.
///
/// NOTE(review): the blanket impl below covers iterators of any item type
/// `T`, but `GroupbyGene` only implements `Iterator` when the wrapped
/// iterator yields `Vec<BusRecord>`; for other item types the returned
/// adapter cannot be iterated. Confirm whether `T` should be constrained.
pub trait GroupbyGeneIterator<T>: Iterator<Item = T> + Sized {
/// Consumes `self` and wraps it, together with `ecmapper`, in a
/// `GroupbyGene`.
fn group_by_gene(self, ecmapper: Ec2GeneMapper) -> GroupbyGene<Self> {
GroupbyGene::new(self, ecmapper)
}
}
// Blanket impl: every iterator gets `group_by_gene` without further code.
impl<T, I: Iterator<Item = T>> GroupbyGeneIterator<T> for I {}
#[cfg(test)]
mod tests {
use std::collections::HashSet;
use crate::consistent_genes::{Ec2GeneMapper, Genename, EC};
use crate::io::BusRecord;
use crate::iterators::{CbUmiGroupIterator, CellGroup, CellGroupIterator};
use crate::{hashmap, record, set};
#[test]
fn test_cb_single_elem() {
    // A lone record must come back as exactly one cell group, then the iterator ends.
    let only = record!(0, 2, 0, 12, 0);
    let mut groups = vec![only.clone()].into_iter().groupby_cb();

    assert_eq!(groups.next(), Some((0, vec![only])));
    assert_eq!(groups.next(), None);
}
#[test]
fn test_cbumi_single_elem() {
    // A lone record must come back as exactly one CB/UMI group, then the iterator ends.
    let only = record!(0, 2, 0, 12, 0);
    let mut groups = vec![only.clone()].into_iter().groupby_cbumi();

    assert_eq!(groups.next(), Some(((0, 2), vec![only])));
    assert_eq!(groups.next(), None);
}
#[test]
fn test_cb_iter_last_element1() {
    // The final cell (CB 1) holds a single record; it must still be emitted,
    // both through `groupby_cb` and through `CellGroup` directly.
    let records = vec![
        record!(0, 2, 0, 12, 0),
        record!(0, 21, 1, 2, 0),
        record!(1, 2, 0, 12, 0),
    ];

    let via_trait: Vec<_> = records
        .clone()
        .into_iter()
        .groupby_cb()
        .map(|(_, group)| group)
        .collect();
    assert_eq!(via_trait.len(), 2);
    assert_eq!(via_trait[1].len(), 1);

    let via_cellgroup: Vec<_> = CellGroup::new(records.into_iter())
        .map(|(_, group)| group)
        .collect();
    assert_eq!(via_cellgroup.len(), 2);
    assert_eq!(via_cellgroup[1].len(), 1);
}
#[test]
fn test_cb_iter_last_element2() {
    // The final cell (CB 1) holds two records; both must land in the last group.
    let records = vec![
        record!(0, 2, 0, 12, 0),
        record!(0, 21, 1, 2, 0),
        record!(1, 2, 0, 12, 0),
        record!(1, 2, 0, 12, 0),
    ];

    let groups: Vec<_> = records.into_iter().groupby_cb().collect();
    assert_eq!(groups.len(), 2);

    let (_, last_cell) = &groups[1];
    assert_eq!(last_cell.len(), 2);
}
#[test]
fn test_cb_iter() {
    // Four cells (CB 0..=3) holding 2, 1, 2 and 1 records respectively.
    let r1 = record!(0, 2, 0, 12, 0);
    let r2 = record!(0, 21, 1, 2, 0);
    let r3 = record!(1, 2, 0, 12, 0);
    let r4 = record!(2, 1, 1, 2, 0);
    let r5 = record!(2, 21, 1, 2, 0);
    let r6 = record!(3, 1, 1, 2, 0);

    let expected: Vec<(u64, Vec<BusRecord>)> = vec![
        (0, vec![r1.clone(), r2.clone()]),
        (1, vec![r3.clone()]),
        (2, vec![r4.clone(), r5.clone()]),
        (3, vec![r6.clone()]),
    ];

    let grouped: Vec<(u64, Vec<BusRecord>)> = vec![r1, r2, r3, r4, r5, r6]
        .into_iter()
        .groupby_cb()
        .collect();

    assert_eq!(grouped.len(), 4);
    for (got, want) in grouped.iter().zip(expected.iter()) {
        assert_eq!(got, want);
    }
}
#[test]
fn test_cbumi_iter() {
    // Five CB/UMI groups; only the first one, (0, 1), holds two records.
    let r1 = record!(0, 1, 0, 12, 0);
    let r2 = record!(0, 1, 1, 2, 0);
    let r3 = record!(0, 2, 0, 12, 0);
    let r4 = record!(1, 1, 1, 2, 0);
    let r5 = record!(1, 2, 1, 2, 0);
    let r6 = record!(2, 1, 1, 2, 0);

    let expected: Vec<((u64, u64), Vec<BusRecord>)> = vec![
        ((0, 1), vec![r1.clone(), r2.clone()]),
        ((0, 2), vec![r3.clone()]),
        ((1, 1), vec![r4.clone()]),
        ((1, 2), vec![r5.clone()]),
        ((2, 1), vec![r6.clone()]),
    ];

    let grouped: Vec<((u64, u64), Vec<BusRecord>)> = vec![r1, r2, r3, r4, r5, r6]
        .into_iter()
        .groupby_cbumi()
        .collect();

    assert_eq!(grouped.len(), 5);
    for (got, want) in grouped.iter().zip(expected.iter()) {
        assert_eq!(got, want);
    }
}
#[test]
#[should_panic(expected = "Unsorted busfile: 2 -> 0")]
fn test_panic_on_unsorted() {
    // The last record drops from CB 2 back to CB 0, which must abort the grouping.
    let records = vec![
        record!(0, 2, 0, 12, 0),
        record!(0, 21, 1, 2, 0),
        record!(2, 2, 0, 12, 0),
        record!(0, 1, 1, 2, 0),
    ];

    // Drain the iterator; the panic fires while the groups are being built.
    let _ = records.into_iter().groupby_cb().count();
}
#[test]
#[should_panic(expected = "Unsorted busfile: (2, 2) -> (0, 1)")]
fn test_panic_on_unsorted_cbumi() {
    // The last record drops from (2, 2) back to (0, 1), which must abort the grouping.
    let records = vec![
        record!(0, 2, 0, 12, 0),
        record!(0, 21, 1, 2, 0),
        record!(2, 2, 0, 12, 0),
        record!(0, 1, 1, 2, 0),
    ];

    // Drain the iterator; the panic fires while the groups are being built.
    let _ = records.into_iter().groupby_cbumi().count();
}
use crate::iterators::GroupbyGeneIterator;
#[test]
fn test_groupby_genes() {
    // Equivalence classes 0..=3 and the gene sets they map to.
    let ec0: HashSet<Genename> = set!(Genename("A".to_string()));
    let ec1: HashSet<Genename> = set!(Genename("B".to_string()));
    let ec2: HashSet<Genename> = set!(Genename("A".to_string()), Genename("B".to_string()));
    let ec3: HashSet<Genename> = set!(Genename("C".to_string()), Genename("D".to_string()));
    let ec_dict = hashmap!(
        EC(0) => ec0,
        EC(1) => ec1,
        EC(2) => ec2,
        EC(3) => ec3
    );
    let mapper = Ec2GeneMapper::new(ec_dict);

    // Two CB/UMI groups, (0, 1) and (0, 2), with two records each.
    // Presumably the third `record!` argument is the EC id -- TODO confirm.
    let records = vec![
        record!(0, 1, 0, 2, 0),
        record!(0, 1, 2, 2, 0),
        record!(0, 2, 0, 3, 0),
        record!(0, 2, 1, 4, 0),
    ];

    let results: Vec<_> = records
        .into_iter()
        .groupby_cbumi()
        .map(|(_, group)| group)
        .group_by_gene(mapper)
        .collect();

    // One output per CB/UMI group: 1 entry for the first, 2 for the second.
    assert_eq!(results.len(), 2);
    assert_eq!(results[0].len(), 1);
    assert_eq!(results[1].len(), 2);
    println!("{:?}", results)
}
}