pub struct ParMultiGzipReader<R>where
R: Read,{ /* private fields */ }Expand description
This reader facilitates parallel decompression of BCF data compressed in
the BGZF format—a specialized version of the multi-member gzip file format.
It utilizes internal buffers to sequentially ingest compressed data from
various gzip blocks, leveraging the rayon crate to achieve concurrent
decompression. This design addresses the potential bottleneck in data
processing speed that occurs when decompression is not executed in parallel,
ensuring more efficient handling of compressed data streams.
Example:
use bcf_reader::*;
use std::fs::File;
use std::io::BufReader;
use std::io::Write;
// read data generated by bcftools
// bcftools query -f '[\t%GT]\n' test.bcf | bgzip -c > test_gt.gz
let mut gt_str = String::new();
smart_reader("testdata/test_gt.gz")
.unwrap()
.read_to_string(&mut gt_str)
.unwrap();
// read data via bcf-reader
let max_gzip_block_in_buffer = 10;
let reader = File::open("testdata/test.bcf").map(BufReader::new).unwrap();
let mut f =
ParMultiGzipReader::from_reader(reader, max_gzip_block_in_buffer, None, None).unwrap();
let s = read_header(&mut f).unwrap();
let header = Header::from_string(&s).unwrap();
let mut record = Record::default();
let mut gt_str2 = Vec::<u8>::new();
while let Ok(_) = record.read(&mut f) {
for (i, bn) in record.fmt_gt(&header).enumerate() {
let bn = bn.unwrap();
let (noploidy, dot, phased, allele) = bn.gt_val();
assert_eq!(noploidy, false); // missing ploidy
let mut sep = '\t';
if i % 2 == 1 {
if phased {
sep = '|';
} else {
sep = '/';
}
}
if dot {
write!(gt_str2, "{sep}.").unwrap();
} else {
write!(gt_str2, "{sep}{allele}").unwrap();
}
}
write!(gt_str2, "\n").unwrap();
}
let gt_str2 = String::from_utf8(gt_str2).unwrap();
// compare bcftools results and bcf-reader results
for (a, b) in gt_str
.split(|c| (c == '\n') || (c == '\t'))
.zip(gt_str2.split(|c| (c == '\n') || (c == '\t')))
{
assert_eq!(a, b);
}
See ParMultiGzipReader::from_reader for an example of jumping to a target
genome interval.
Implementations§
Source§impl<R> ParMultiGzipReader<R>where
R: Read,
impl<R> ParMultiGzipReader<R>where
R: Read,
Sourcepub fn from_reader(
reader: R,
ngzip_max: usize,
coffset: Option<u64>,
uoffset: Option<u64>,
) -> Result<Self>
pub fn from_reader( reader: R, ngzip_max: usize, coffset: Option<u64>, uoffset: Option<u64>, ) -> Result<Self>
Constructs a new ParMultiGzipReader by specifying the ngzip_max parameter,
which defines the maximum number of gzip blocks that the internal buffers can
handle simultaneously. This parameter should ideally be set to the number of
CPU cores available to optimize parallel decompression performance, thereby
leveraging the hardware’s concurrency capabilities.
The coffset parameter is the offset of the first byte of the target gzip
block within the input reader. The input reader must already be positioned
at that byte before it is passed to ParMultiGzipReader::from_reader; if it
is not, call Seek::seek on the input reader first to move it to coffset.
Note that ParMultiGzipReader does not call Seek::seek on the reader itself.
The uoffset parameter specifies the number of bytes to skip within the
decompressed data of the first gzip block. Since skipping within
uncompressed data requires decompression, this offset is applied within the
ParMultiGzipReader::from_reader method.
§Examples
use bcf_reader::*;
use std::{
fs::File,
io::{BufReader, Seek},
};
// index file
let csi = Csi::from_path("testdata/test3.bcf.csi").unwrap();
// reader
let mut reader = File::open("testdata/test3.bcf")
.map(BufReader::new)
.unwrap();
// calculate first offsets of the target position
let start = 1495403 - 1;
let end = 1495746 - 1;
let chrom_id = 0;
let bin_id = csi.get_bin_id(start, start + 1 as i64);
let bin_details = csi.get_bin_details(chrom_id, bin_id).unwrap();
let (coffset, uoffset) = bin_details.chunks()[0].chunk_beg.get_coffset_uoffset();
// seek to the target bgzip block
reader.seek(std::io::SeekFrom::Start(coffset)).unwrap();
// create the parallelizable reader by wrapping around the existing reader
// and specifying offsets
let mut reader =
ParMultiGzipReader::from_reader(reader, 1, Some(coffset), Some(uoffset)).unwrap();
let mut record = Record::default();
let mut pos_found = vec![];
while let Ok(_) = record.read(&mut reader) {
let pos = record.pos() as i64;
// the bin containing the start position of target interval may have records
// before the start position, so skip them.
if pos < start {
continue;
}
// read the record until out of the target interval
else if pos >= end {
break;
}
pos_found.push(pos);
}
assert_eq!(pos_found, vec![start]);
§Parameters
reader: The input reader from which gzip blocks will be read.
ngzip_max: The maximum number of gzip blocks that can be processed in parallel.
coffset: An optional offset to the start of a gzip block in the input reader.
uoffset: An optional offset within the first block of decompressed data.
§Returns
Returns a new instance of ParMultiGzipReader.
pub fn get_coffset_uoffset(&self) -> (u64, u64)
Trait Implementations§
Source§impl<R> Read for ParMultiGzipReader<R>where
R: Read,
impl<R> Read for ParMultiGzipReader<R>where
R: Read,
Source§fn read(&mut self, buf: &mut [u8]) -> Result<usize>
fn read(&mut self, buf: &mut [u8]) -> Result<usize>
1.36.0 · Source§fn read_vectored(&mut self, bufs: &mut [IoSliceMut<'_>]) -> Result<usize, Error>
fn read_vectored(&mut self, bufs: &mut [IoSliceMut<'_>]) -> Result<usize, Error>
read, except that it reads into a slice of buffers. Read moreSource§fn is_read_vectored(&self) -> bool
fn is_read_vectored(&self) -> bool
can_vector)1.0.0 · Source§fn read_to_end(&mut self, buf: &mut Vec<u8>) -> Result<usize, Error>
fn read_to_end(&mut self, buf: &mut Vec<u8>) -> Result<usize, Error>
buf. Read more1.0.0 · Source§fn read_to_string(&mut self, buf: &mut String) -> Result<usize, Error>
fn read_to_string(&mut self, buf: &mut String) -> Result<usize, Error>
buf. Read more1.6.0 · Source§fn read_exact(&mut self, buf: &mut [u8]) -> Result<(), Error>
fn read_exact(&mut self, buf: &mut [u8]) -> Result<(), Error>
buf. Read moreSource§fn read_buf(&mut self, buf: BorrowedCursor<'_>) -> Result<(), Error>
fn read_buf(&mut self, buf: BorrowedCursor<'_>) -> Result<(), Error>
read_buf)Source§fn read_buf_exact(&mut self, cursor: BorrowedCursor<'_>) -> Result<(), Error>
fn read_buf_exact(&mut self, cursor: BorrowedCursor<'_>) -> Result<(), Error>
read_buf)cursor. Read more1.0.0 · Source§fn by_ref(&mut self) -> &mut Selfwhere
Self: Sized,
fn by_ref(&mut self) -> &mut Selfwhere
Self: Sized,
Read. Read more1.0.0 · Source§fn chain<R>(self, next: R) -> Chain<Self, R>
fn chain<R>(self, next: R) -> Chain<Self, R>
Auto Trait Implementations§
impl<R> Freeze for ParMultiGzipReader<R>where
R: Freeze,
impl<R> RefUnwindSafe for ParMultiGzipReader<R>where
R: RefUnwindSafe,
impl<R> Send for ParMultiGzipReader<R>where
R: Send,
impl<R> Sync for ParMultiGzipReader<R>where
R: Sync,
impl<R> Unpin for ParMultiGzipReader<R>where
R: Unpin,
impl<R> UnwindSafe for ParMultiGzipReader<R>where
R: UnwindSafe,
Blanket Implementations§
Source§impl<T> BorrowMut<T> for Twhere
T: ?Sized,
impl<T> BorrowMut<T> for Twhere
T: ?Sized,
Source§fn borrow_mut(&mut self) -> &mut T
fn borrow_mut(&mut self) -> &mut T
Source§impl<T> IntoEither for T
impl<T> IntoEither for T
Source§fn into_either(self, into_left: bool) -> Either<Self, Self>
fn into_either(self, into_left: bool) -> Either<Self, Self>
self into a Left variant of Either<Self, Self>
if into_left is true.
Converts self into a Right variant of Either<Self, Self>
otherwise. Read moreSource§fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
self into a Left variant of Either<Self, Self>
if into_left(&self) returns true.
Converts self into a Right variant of Either<Self, Self>
otherwise. Read moreSource§impl<T> Pointable for T
impl<T> Pointable for T
Source§impl<R> ReadBytesExt for R
impl<R> ReadBytesExt for R
Source§fn read_u8(&mut self) -> Result<u8, Error>
fn read_u8(&mut self) -> Result<u8, Error>
Source§fn read_i8(&mut self) -> Result<i8, Error>
fn read_i8(&mut self) -> Result<i8, Error>
Source§fn read_u16<T>(&mut self) -> Result<u16, Error>where
T: ByteOrder,
fn read_u16<T>(&mut self) -> Result<u16, Error>where
T: ByteOrder,
Source§fn read_i16<T>(&mut self) -> Result<i16, Error>where
T: ByteOrder,
fn read_i16<T>(&mut self) -> Result<i16, Error>where
T: ByteOrder,
Source§fn read_u24<T>(&mut self) -> Result<u32, Error>where
T: ByteOrder,
fn read_u24<T>(&mut self) -> Result<u32, Error>where
T: ByteOrder,
Source§fn read_i24<T>(&mut self) -> Result<i32, Error>where
T: ByteOrder,
fn read_i24<T>(&mut self) -> Result<i32, Error>where
T: ByteOrder,
Source§fn read_u32<T>(&mut self) -> Result<u32, Error>where
T: ByteOrder,
fn read_u32<T>(&mut self) -> Result<u32, Error>where
T: ByteOrder,
Source§fn read_i32<T>(&mut self) -> Result<i32, Error>where
T: ByteOrder,
fn read_i32<T>(&mut self) -> Result<i32, Error>where
T: ByteOrder,
Source§fn read_u48<T>(&mut self) -> Result<u64, Error>where
T: ByteOrder,
fn read_u48<T>(&mut self) -> Result<u64, Error>where
T: ByteOrder,
Source§fn read_i48<T>(&mut self) -> Result<i64, Error>where
T: ByteOrder,
fn read_i48<T>(&mut self) -> Result<i64, Error>where
T: ByteOrder,
Source§fn read_u64<T>(&mut self) -> Result<u64, Error>where
T: ByteOrder,
fn read_u64<T>(&mut self) -> Result<u64, Error>where
T: ByteOrder,
Source§fn read_i64<T>(&mut self) -> Result<i64, Error>where
T: ByteOrder,
fn read_i64<T>(&mut self) -> Result<i64, Error>where
T: ByteOrder,
Source§fn read_u128<T>(&mut self) -> Result<u128, Error>where
T: ByteOrder,
fn read_u128<T>(&mut self) -> Result<u128, Error>where
T: ByteOrder,
Source§fn read_i128<T>(&mut self) -> Result<i128, Error>where
T: ByteOrder,
fn read_i128<T>(&mut self) -> Result<i128, Error>where
T: ByteOrder,
Source§fn read_uint<T>(&mut self, nbytes: usize) -> Result<u64, Error>where
T: ByteOrder,
fn read_uint<T>(&mut self, nbytes: usize) -> Result<u64, Error>where
T: ByteOrder,
Source§fn read_int<T>(&mut self, nbytes: usize) -> Result<i64, Error>where
T: ByteOrder,
fn read_int<T>(&mut self, nbytes: usize) -> Result<i64, Error>where
T: ByteOrder,
Source§fn read_uint128<T>(&mut self, nbytes: usize) -> Result<u128, Error>where
T: ByteOrder,
fn read_uint128<T>(&mut self, nbytes: usize) -> Result<u128, Error>where
T: ByteOrder,
Source§fn read_int128<T>(&mut self, nbytes: usize) -> Result<i128, Error>where
T: ByteOrder,
fn read_int128<T>(&mut self, nbytes: usize) -> Result<i128, Error>where
T: ByteOrder,
Source§fn read_f32<T>(&mut self) -> Result<f32, Error>where
T: ByteOrder,
fn read_f32<T>(&mut self) -> Result<f32, Error>where
T: ByteOrder,
Source§fn read_f64<T>(&mut self) -> Result<f64, Error>where
T: ByteOrder,
fn read_f64<T>(&mut self) -> Result<f64, Error>where
T: ByteOrder,
Source§fn read_u16_into<T>(&mut self, dst: &mut [u16]) -> Result<(), Error>where
T: ByteOrder,
fn read_u16_into<T>(&mut self, dst: &mut [u16]) -> Result<(), Error>where
T: ByteOrder,
Source§fn read_u32_into<T>(&mut self, dst: &mut [u32]) -> Result<(), Error>where
T: ByteOrder,
fn read_u32_into<T>(&mut self, dst: &mut [u32]) -> Result<(), Error>where
T: ByteOrder,
Source§fn read_u64_into<T>(&mut self, dst: &mut [u64]) -> Result<(), Error>where
T: ByteOrder,
fn read_u64_into<T>(&mut self, dst: &mut [u64]) -> Result<(), Error>where
T: ByteOrder,
Source§fn read_u128_into<T>(&mut self, dst: &mut [u128]) -> Result<(), Error>where
T: ByteOrder,
fn read_u128_into<T>(&mut self, dst: &mut [u128]) -> Result<(), Error>where
T: ByteOrder,
Source§fn read_i8_into(&mut self, dst: &mut [i8]) -> Result<(), Error>
fn read_i8_into(&mut self, dst: &mut [i8]) -> Result<(), Error>
Source§fn read_i16_into<T>(&mut self, dst: &mut [i16]) -> Result<(), Error>where
T: ByteOrder,
fn read_i16_into<T>(&mut self, dst: &mut [i16]) -> Result<(), Error>where
T: ByteOrder,
Source§fn read_i32_into<T>(&mut self, dst: &mut [i32]) -> Result<(), Error>where
T: ByteOrder,
fn read_i32_into<T>(&mut self, dst: &mut [i32]) -> Result<(), Error>where
T: ByteOrder,
Source§fn read_i64_into<T>(&mut self, dst: &mut [i64]) -> Result<(), Error>where
T: ByteOrder,
fn read_i64_into<T>(&mut self, dst: &mut [i64]) -> Result<(), Error>where
T: ByteOrder,
Source§fn read_i128_into<T>(&mut self, dst: &mut [i128]) -> Result<(), Error>where
T: ByteOrder,
fn read_i128_into<T>(&mut self, dst: &mut [i128]) -> Result<(), Error>where
T: ByteOrder,
Source§fn read_f32_into<T>(&mut self, dst: &mut [f32]) -> Result<(), Error>where
T: ByteOrder,
fn read_f32_into<T>(&mut self, dst: &mut [f32]) -> Result<(), Error>where
T: ByteOrder,
Source§fn read_f32_into_unchecked<T>(&mut self, dst: &mut [f32]) -> Result<(), Error>where
T: ByteOrder,
fn read_f32_into_unchecked<T>(&mut self, dst: &mut [f32]) -> Result<(), Error>where
T: ByteOrder,
read_f32_into instead