use std::io::{
Read,
Write,
};
use indexmap::IndexSet;
use itertools::Itertools;
use polars::error::PolarsResult;
use serde::{
Deserialize,
Serialize,
};
use super::BsxFileReader;
use crate::data_structs::coords::{
Contig,
ContigIntervalMap,
};
use crate::data_structs::typedef::BsxSmallStr;
use crate::prelude::Strand;
/// Index that associates contigs with the indices of the batches registered
/// for them, and remembers the order in which sequence names were first seen.
///
/// The type derives `Serialize`/`Deserialize`; serde visits fields in
/// declaration order, so reordering the fields below would change the encoded
/// layout.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BatchIndex {
/// Batch indices keyed by the contig they were inserted under.
map: ContigIntervalMap<usize>,
/// Sequence names in first-insertion order; the position of a name in this
/// set is used as the chromosome rank when sorting contigs.
chr_order: IndexSet<BsxSmallStr>,
}
/// Builds an index from `(contig, batch index)` pairs, inserting them in
/// iteration order.
impl FromIterator<(Contig, usize)> for BatchIndex {
    fn from_iter<I: IntoIterator<Item = (Contig, usize)>>(iter: I) -> Self {
        // Thread a fresh index through the pairs; each pair goes through
        // `insert`, so chromosome order follows the iteration order.
        iter.into_iter()
            .fold(Self::new(), |mut acc, (region, batch_no)| {
                acc.insert(region, batch_no);
                acc
            })
    }
}
impl BatchIndex {
    /// Encodes the index into `writer` using bincode's standard configuration.
    ///
    /// The index is consumed by the call.
    ///
    /// # Errors
    ///
    /// Returns the [`bincode::error::EncodeError`] produced by the encoder.
    pub fn to_file<W: Write>(
        self,
        writer: &mut W,
    ) -> Result<(), bincode::error::EncodeError> {
        // The encoder's success value is not needed by callers; drop it.
        bincode::serde::encode_into_std_write(
            self,
            writer,
            bincode::config::standard(),
        )
        .map(|_| ())
    }

    /// Decodes an index from `reader` using bincode's standard configuration.
    ///
    /// # Errors
    ///
    /// Returns the [`bincode::error::DecodeError`] produced by the decoder.
    pub fn from_file<R: Read>(
        reader: &mut R
    ) -> Result<Self, bincode::error::DecodeError> {
        bincode::serde::decode_from_std_read(reader, bincode::config::standard())
    }

    /// Builds an index by iterating over every batch yielded by `reader`,
    /// registering each batch's contig under its position in the iteration.
    ///
    /// # Errors
    ///
    /// Returns the first error yielded by the reader's iterator.
    ///
    /// # Panics
    ///
    /// Panics if `as_contig()` cannot produce a contig for a batch
    /// (NOTE(review): presumably the case for an empty batch; confirm against
    /// the batch type).
    pub fn from_reader(reader: &mut BsxFileReader) -> PolarsResult<Self> {
        reader
            .iter()
            .enumerate()
            .map(|(position, item)| {
                item.map(|batch| (batch.as_contig().unwrap(), position))
            })
            // `Result` collects through `BatchIndex: FromIterator`, stopping
            // at the first `Err`.
            .collect::<PolarsResult<Self>>()
    }
}
impl Default for BatchIndex {
fn default() -> Self {
Self::new()
}
}
impl BatchIndex {
    /// Creates an empty index with no contigs and no known chromosomes.
    pub fn new() -> Self {
        Self {
            map: ContigIntervalMap::new(),
            chr_order: IndexSet::new(),
        }
    }

    /// Registers `batch_idx` under `contig`.
    ///
    /// The contig's sequence name is added to the chromosome order; a name
    /// that is already present keeps its existing position.
    pub fn insert(
        &mut self,
        contig: Contig,
        batch_idx: usize,
    ) {
        self.chr_order.insert(contig.seqname().clone());
        self.map.insert(contig, batch_idx);
    }

    /// Sorts `contigs` by the chromosome order stored in this index, then by
    /// start position.
    ///
    /// Contigs whose sequence name is not present in the index are placed
    /// after all indexed chromosomes (ordered among themselves by start
    /// position only) instead of being mixed into the first chromosome.
    /// The sort is stable, so contigs that compare equal keep their input
    /// order.
    pub fn sort<I>(
        &self,
        contigs: I,
    ) -> impl Iterator<Item = Contig>
    where
        I: IntoIterator<Item = Contig>,
    {
        contigs
            .into_iter()
            .map(|contig| {
                // Unknown sequence names get the highest possible rank so
                // they can never collide with an indexed chromosome.
                let chr_rank = self
                    .chr_order
                    .get_index_of(contig.seqname())
                    .unwrap_or(usize::MAX);
                (chr_rank, contig)
            })
            .sorted_by(|(left_rank, left), (right_rank, right)| {
                left_rank
                    .cmp(right_rank)
                    .then_with(|| left.start().cmp(&right.start()))
            })
            .map(|(_, contig)| contig)
    }

    /// Looks up `contig` in the underlying interval map and returns the batch
    /// indices it reports, copied into a new vector.
    ///
    /// `None` is returned whenever the underlying map's `find` returns `None`
    /// (NOTE(review): presumably for a sequence name that was never inserted;
    /// confirm against `ContigIntervalMap::find`).
    pub fn find(
        &self,
        contig: &Contig,
    ) -> Option<Vec<usize>> {
        self.map
            .find(contig)
            .map(|hits| hits.into_iter().copied().collect())
    }

    /// Returns the batch indices found for the whole of chromosome `chr`.
    ///
    /// Implemented as [`find`](Self::find) with an unstranded query spanning
    /// `0..=u32::MAX` on `chr`.
    pub fn chr_indices<R: AsRef<str>>(
        &self,
        chr: R,
    ) -> Option<Vec<usize>> {
        let whole_chr = Contig::new(chr.as_ref().into(), 0, u32::MAX, Strand::None);
        self.find(&whole_chr)
    }

    /// Returns the sequence names in first-insertion order.
    pub fn get_chr_order(&self) -> &IndexSet<BsxSmallStr> {
        &self.chr_order
    }

    /// Returns the position of `chr` in the chromosome order, or `None` if
    /// the name has not been inserted.
    pub fn get_chr_index(
        &self,
        chr: &BsxSmallStr,
    ) -> Option<usize> {
        self.chr_order.get_index_of(chr)
    }

    /// Returns a reference to the underlying contig-to-batch interval map.
    pub fn map(&self) -> &ContigIntervalMap<usize> {
        &self.map
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::data_structs::Strand;

    /// Overlapping, non-overlapping and other-chromosome queries return the
    /// expected batch indices.
    #[test]
    fn test_insert_and_find() {
        let mut index = BatchIndex::new();
        let registered = [
            (Contig::new("chr1".into(), 1, 100, Strand::None), 1),
            (Contig::new("chr1".into(), 50, 150, Strand::None), 2),
            (Contig::new("chr2".into(), 1, 100, Strand::None), 3),
        ];
        for (contig, batch_idx) in registered {
            index.insert(contig, batch_idx);
        }

        // Overlaps both chr1 entries.
        let overlapping = Contig::new("chr1".into(), 20, 60, Strand::None);
        assert_eq!(index.find(&overlapping), Some(vec![1, 2]));

        // Known chromosome, but nothing registered in this range.
        let beyond = Contig::new("chr1".into(), 200, 300, Strand::None);
        assert_eq!(index.find(&beyond), Some(vec![]));

        // Only the chr2 entry matches.
        let other_chr = Contig::new("chr2".into(), 50, 60, Strand::None);
        assert_eq!(index.find(&other_chr), Some(vec![3]));
    }

    /// Contigs are ordered by the index's chromosome order first (chr2 before
    /// chr1 here), then by start position.
    #[test]
    fn test_sort() {
        let mut index = BatchIndex::new();
        for name in ["chr2", "chr1"] {
            index.chr_order.insert(name.into());
        }

        let chr1_late = Contig::new("chr1".into(), 50, 150, Strand::None);
        let chr1_early = Contig::new("chr1".into(), 1, 100, Strand::None);
        let chr2_early = Contig::new("chr2".into(), 1, 100, Strand::None);
        let chr2_late = Contig::new("chr2".into(), 50, 150, Strand::None);

        let shuffled = vec![
            chr1_late.clone(),
            chr1_early.clone(),
            chr2_early.clone(),
            chr2_late.clone(),
        ];
        let sorted: Vec<_> = index.sort(shuffled).collect();

        assert_eq!(sorted, vec![chr2_early, chr2_late, chr1_early, chr1_late]);
    }
}