use libdeflater::Decompressor;
use positioned_io::ReadAt;
use std::cell::Cell;
use std::collections::BTreeMap;
use std::error;
use std::fs::File;
use std::ops::Bound::{Excluded, Included};
use std::str;
use std::{error::Error, fmt};
#[derive(Copy, Clone)]
struct BgzfBlock {
data_offset: u64,
data_length: u32,
input_length: u32,
block_size: u32,
}
pub struct BgzfReader {
bgzf_file: File,
block_tree: BTreeMap<u64, BgzfBlock>,
pub input_length: u64,
pub current_read_position: Cell<u64>,
pub pos: Cell<u64>,
}
impl BgzfReader {
pub fn new(file_path: String) -> Result<BgzfReader, Box<dyn error::Error>> {
let mut b_tree = BTreeMap::new();
let bgzf_file = File::open(file_path)?;
let mut input_offset: u64 = 0;
let mut current_file_position = 0;
loop {
match read_block(&bgzf_file, current_file_position) {
Ok(option_block) => match option_block {
Some(block) => {
let input_length_block = block.input_length;
let block_size_block = block.block_size;
b_tree.insert(input_offset, block);
input_offset += u64::from(input_length_block);
current_file_position += u64::from(block_size_block);
}
None => break,
},
Err(_e) => break,
}
}
let reader = BgzfReader {
bgzf_file,
block_tree: b_tree,
input_length: input_offset,
current_read_position: Cell::new(0),
pos: Cell::new(0),
};
Ok(reader)
}
pub fn seek(&self, pos: u64) {
self.pos.set(pos);
}
pub fn total_uncompressed_length(&self) -> u64 {
self.input_length
}
pub fn read_to(&self, b: &mut Vec<u8>) -> Result<i32, Box<dyn error::Error>> {
self.read(b, 0, b.len())
}
pub fn read(
&self,
b: &mut Vec<u8>,
off: usize,
len: usize,
) -> Result<i32, Box<dyn error::Error>> {
if b.len() == 0 {
return Err(BGZFError::new("Buffer size needs to be greater than 0").into());
}
if len > b.len() - off {
return Err(BGZFError::new("Index out of bound exception").into());
}
if len == 0 {
return Ok(0);
}
if self.pos.get() >= self.input_length {
return Ok(-1);
}
let mut off = off;
let mut len = len;
let mut un_compressor = Decompressor::new();
let mut cb: i32 = 0;
#[derive(Copy, Clone)]
struct Entry {
key: u64,
value: BgzfBlock,
}
let mut entry_vector: Vec<Entry> = Vec::new();
if !self.block_tree.contains_key(&self.pos.get()) {
let floored_value = self.block_tree.range(..self.pos.get()).next_back().unwrap();
entry_vector.push(Entry {
key: *floored_value.0,
value: *floored_value.1,
});
}
let pos_and_len_combined = self.pos.get() + len as u64;
for (&key, &value) in self
.block_tree
.range((Included(self.pos.get()), Excluded(pos_and_len_combined)))
{
entry_vector.push(Entry { key, value });
}
for entry in entry_vector {
let block = entry.value;
let input_offset = entry.key;
let mut compressed = vec![0u8; block.data_length as usize];
self
.bgzf_file
.read_exact_at(block.data_offset, &mut compressed)?;
let mut uncompressed = vec![0u8; block.input_length as usize];
let bytes_decompressed =
un_compressor.deflate_decompress(&mut compressed, &mut uncompressed)?;
if bytes_decompressed == 0 || bytes_decompressed != block.input_length as usize {
return Err(BGZFError::new("Did not fully de-compress").into());
}
let mut copy_start: u64 = 0;
let mut copy_length = block.input_length;
if input_offset < self.pos.get() {
let copy_skip = self.pos.get() - input_offset;
copy_start += copy_skip;
copy_length -= copy_skip as u32;
}
if copy_length > len as u32 {
copy_length = len as u32;
}
let end_index = copy_start + u64::from(copy_length);
b[off..].copy_from_slice(&uncompressed[copy_start as usize..end_index as usize]);
len -= copy_length as usize;
self.pos.set(self.pos.get() + u64::from(copy_length));
off += copy_length as usize;
cb += copy_length as i32;
}
Ok(cb)
}
}
fn read_block(
file: &File,
current_file_position: u64,
) -> Result<Option<BgzfBlock>, Box<dyn error::Error>> {
let mut current_file_position = current_file_position;
let mut buf = [0; 12];
file.read_exact_at(current_file_position, &mut buf)?;
current_file_position += buf.len() as u64;
if buf[0] != 31 || buf[1] != 139 || buf[2] != 8 || buf[3] != 4 {
return Err(BGZFError::new("Incorrect header").into());
}
let xlen: u16 = (buf[10] as u16) | ((buf[11] as u16) << 8);
let mut buf_xlen = vec![0u8; usize::from(xlen)];
file.read_exact_at(current_file_position, &mut buf_xlen)?;
current_file_position += buf_xlen.len() as u64;
if buf_xlen[0] != 66 || buf_xlen[1] != 67 {
return Err(BGZFError::new("Bad subfield Identifier").into());
}
if ((buf_xlen[2] as u16) | ((buf_xlen[3] as u16) << 8)) != 2 {
return Err(BGZFError::new("Bad subfield Length").into());
}
let bsize = (buf_xlen[4] as u16) | ((buf_xlen[5] as u16) << 8);
let block_size = u32::from(bsize) + 1;
let data_length = bsize - xlen - 19;
let data_offset = current_file_position;
current_file_position += u64::from(data_length) + 4;
let mut buf_isize = [0; 4];
file.read_exact_at(current_file_position, &mut buf_isize)?;
let i_size: u32 = (buf_isize[0] as u32)
| ((buf_isize[1] as u32) << 8)
| ((buf_isize[2] as u32) << 16)
| ((buf_isize[3] as u32) << 24);
if i_size == 0 {
return Ok(None);
}
let block = BgzfBlock {
data_offset,
data_length: u32::from(data_length),
input_length: u32::from(i_size),
block_size,
};
Ok(Some(block))
}
#[derive(Debug)]
struct BGZFError {
msg: String,
}
impl BGZFError {
fn new(msg: &str) -> BGZFError {
BGZFError {
msg: msg.to_string(),
}
}
}
impl Error for BGZFError {
fn description(&self) -> &str {
&self.msg
}
}
impl fmt::Display for BGZFError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "{}", self.msg)
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_read_block_func() {
let bgzf_file = File::open("bgzf_test.bgz").unwrap();
match read_block(&bgzf_file, 0) {
Ok(option_block) => match option_block {
Some(block) => {
assert_eq!(block.block_size, 211);
assert_eq!(block.data_length, 185);
assert_eq!(block.data_offset, 18);
assert_eq!(block.input_length, 280);
}
None => assert!(false),
},
Err(_e) => assert!(false),
}
}
#[test]
fn test_bgzf_reader_new_func() {
let bgzf_reader = BgzfReader::new(String::from("bgzf_test.bgz"));
match bgzf_reader {
Ok(reader) => {
let expected_uncompressed_length = 280;
assert_eq!(1, reader.block_tree.len());
assert_eq!(expected_uncompressed_length, reader.input_length);
assert_eq!(0, reader.current_read_position.get());
let block = reader.block_tree.get(&0);
match block {
Some(block) => {
assert_eq!(block.block_size, 211);
assert_eq!(block.data_length, 185);
assert_eq!(block.data_offset, 18);
assert_eq!(block.input_length, 280);
}
None => assert!(false),
}
}
Err(_e) => assert!(false),
}
}
#[test]
fn test_bgzf_read_method() {
let reader = BgzfReader::new(String::from("bgzf_test.bgz")).unwrap();
let mut content = vec![0; 10];
match reader.read(&mut content, 0, 10) {
Ok(val) => {
assert_eq!(10, val);
}
Err(e) => {
assert!(false);
println!("The error occurred is : {}", e);
}
};
let file_content = str::from_utf8(&content).unwrap();
assert_eq!("This is ju", file_content);
reader.seek(20);
println!(
"Current read position is: {}",
reader.current_read_position.get()
);
let mut content_two = vec![0; 32];
match reader.read(&mut content_two, 0, 32) {
Ok(val) => {
assert_eq!(32, val);
}
Err(e) => {
assert!(false);
println!("The error occurred is : {}", e);
}
};
let file_content_two = str::from_utf8(&content_two).unwrap();
assert_eq!("test,lets see how it reacts. :).", file_content_two);
}
#[test]
fn test_seek_method() {
let reader = BgzfReader::new(String::from("bgzf_test.bgz")).unwrap();
reader.seek(33);
assert_eq!(0, reader.current_read_position.get());
assert_eq!(33, reader.pos.get());
}
#[test]
fn test_read_to() {
let reader = BgzfReader::new(String::from("bgzf_test.bgz")).unwrap();
let mut vec = vec![0; 52];
let data_read = reader.read_to(&mut vec);
assert_eq!(data_read.unwrap(), 52);
assert_eq!(
"This is just a bgzf test,lets see how it reacts. :).",
str::from_utf8(&vec).unwrap()
);
}
}