1use libdeflater::Decompressor;
2use positioned_io::ReadAt;
3use std::cell::Cell;
4use std::cell::RefCell;
5use std::cmp::min;
6use std::collections::BTreeMap;
7use std::error;
8use std::fs::File;
9use std::ops::Bound::{Excluded, Included};
10use std::str;
11use std::{error::Error, fmt};
12
/// Index entry describing where one BGZF block lives in the compressed file.
///
/// All values are taken from the block's gzip header and trailer by
/// `read_block`; nothing here holds the block's actual data.
#[derive(Clone, Copy)]
struct BgzfBlock {
    /// Absolute file offset of the first byte of the raw deflate payload.
    data_offset: u64,
    /// Number of compressed (deflate) payload bytes starting at `data_offset`.
    data_length: u32,
    /// Size of the block once decompressed (the gzip `ISIZE` trailer field).
    input_length: u32,
    /// Total on-disk size of the block, header and trailer included (`BSIZE + 1`).
    block_size: u32,
}
26
/// The most recently decompressed block, kept so that follow-up reads inside
/// the same block do not have to inflate it again.
#[derive(Clone)]
struct Cache {
    /// Offset, in the uncompressed stream, of the first byte of `uncompressed_data`.
    pos: u64,
    /// Fully decompressed contents of the cached block.
    uncompressed_data: Vec<u8>,
}
33
34pub struct BgzfReader {
42 bgzf_file: File,
43 block_tree: BTreeMap<u64, BgzfBlock>,
44 cache: RefCell<Option<Cache>>,
45 pub input_length: u64,
46 pub current_read_position: Cell<u64>,
47 pub pos: Cell<u64>,
48}
49
50impl BgzfReader {
70 pub fn new(file_path: String) -> Result<BgzfReader, Box<dyn error::Error>> {
71 let mut b_tree = BTreeMap::new();
72 let bgzf_file = File::open(file_path)?;
73 let mut input_offset: u64 = 0;
74 let mut current_file_position = 0;
75 loop {
76 match read_block(&bgzf_file, current_file_position) {
77 Ok(option_block) => match option_block {
78 Some(block) => {
79 let input_length_block = block.input_length;
80 let block_size_block = block.block_size;
81 b_tree.insert(input_offset, block);
82 input_offset += u64::from(input_length_block);
83 current_file_position += u64::from(block_size_block);
84 }
85 None => break,
86 },
87 Err(_e) => break,
88 }
89 }
90 let reader = BgzfReader {
91 bgzf_file,
92 block_tree: b_tree,
93 input_length: input_offset,
94 current_read_position: Cell::new(0),
95 pos: Cell::new(0),
96 cache: RefCell::new(None),
97 };
98 Ok(reader)
99 }
100
101 pub fn seek(&self, pos: u64) {
114 self.pos.set(pos);
115 }
116
117 pub fn total_uncompressed_length(&self) -> u64 {
119 self.input_length
120 }
121
122 pub fn read_to(&self, b: &mut Vec<u8>) -> Result<i32, Box<dyn error::Error>> {
140 self.read(b, 0, b.len())
141 }
142
143 pub fn read(
166 &self,
167 b: &mut Vec<u8>,
168 off: usize,
169 len: usize,
170 ) -> Result<i32, Box<dyn error::Error>> {
171 if b.len() == 0 {
172 return Err(BGZFError::new("Buffer size needs to be greater than 0").into());
173 }
174 if len > b.len() - off {
175 return Err(BGZFError::new("Index out of bound exception").into());
176 }
177 if len == 0 {
178 return Ok(0);
179 }
180 if self.pos.get() >= self.input_length {
181 return Ok(-1);
182 }
183
184 let mut off = off;
185 let mut len = len;
186 let mut cb: i32 = 0;
187
188 match self.cache.borrow().as_ref() {
189 Some(cache) => {
190 if self.pos.get() >= cache.pos {
191 let bytes_available_in_cache =
192 cache.pos as usize + cache.uncompressed_data.len() - self.pos.get() as usize;
193 if bytes_available_in_cache > 0 {
194 let copy_start = (self.pos.get() - cache.pos) as usize;
195 let copy_length = min(bytes_available_in_cache, len);
196 let end_index = copy_start + copy_length;
197 b[off..]
198 .copy_from_slice(&cache.uncompressed_data[copy_start as usize..end_index as usize]);
199 cb += copy_length as i32;
200 off += copy_length;
201 len -= copy_length;
202 self.pos.set(self.pos.get() + copy_length as u64);
203 if len == 0 {
204 return Ok(cb);
205 }
206 }
207 }
208 }
209 None => {
210 }
212 }
213
214 let mut un_compressor = Decompressor::new();
215
216 #[derive(Copy, Clone)]
217 struct Entry {
218 key: u64,
219 value: BgzfBlock,
220 }
221
222 let mut entry_vector: Vec<Entry> = Vec::new();
223
224 if !self.block_tree.contains_key(&self.pos.get()) {
225 let floored_value = self.block_tree.range(..self.pos.get()).next_back().unwrap();
226 entry_vector.push(Entry {
228 key: *floored_value.0,
229 value: *floored_value.1,
230 });
231 }
232 let pos_and_len_combined = self.pos.get() + len as u64;
235 for (&key, &value) in self
236 .block_tree
237 .range((Included(self.pos.get()), Excluded(pos_and_len_combined)))
238 {
239 entry_vector.push(Entry { key, value });
240 }
241
242 for entry in entry_vector {
243 let block = entry.value;
244 let input_offset = entry.key;
245
246 let mut compressed = vec![0u8; block.data_length as usize];
248 self
249 .bgzf_file
250 .read_exact_at(block.data_offset, &mut compressed)?;
251
252 let mut uncompressed = vec![0u8; block.input_length as usize];
254 let bytes_decompressed =
255 un_compressor.deflate_decompress(&mut compressed, &mut uncompressed)?;
256
257 if bytes_decompressed == 0 || bytes_decompressed != block.input_length as usize {
258 return Err(BGZFError::new("Did not fully de-compress").into());
259 }
260
261 self.cache.replace(Some(Cache {
262 pos: input_offset,
263 uncompressed_data: uncompressed.clone(),
264 }));
265
266 let mut copy_start: u64 = 0;
267 let mut copy_length = block.input_length;
269 if input_offset < self.pos.get() {
270 let copy_skip = self.pos.get() - input_offset;
271 copy_start += copy_skip;
272 copy_length -= copy_skip as u32;
273 }
274
275 if copy_length > len as u32 {
276 copy_length = len as u32;
277 }
278 let end_index = copy_start + u64::from(copy_length);
279 b[off..].copy_from_slice(&uncompressed[copy_start as usize..end_index as usize]);
280 len -= copy_length as usize;
281 self.pos.set(self.pos.get() + u64::from(copy_length));
282 off += copy_length as usize;
283 cb += copy_length as i32;
284 }
285 Ok(cb)
286 }
287}
288
289fn read_block(
290 file: &File,
291 current_file_position: u64,
292) -> Result<Option<BgzfBlock>, Box<dyn error::Error>> {
293 let mut current_file_position = current_file_position;
294
295 let mut buf = [0; 12];
296 file.read_exact_at(current_file_position, &mut buf)?;
297 current_file_position += buf.len() as u64;
298
299 if buf[0] != 31 || buf[1] != 139 || buf[2] != 8 || buf[3] != 4 {
300 return Err(BGZFError::new("Incorrect header").into());
301 }
302
303 let xlen: u16 = (buf[10] as u16) | ((buf[11] as u16) << 8);
304
305 let mut buf_xlen = vec![0u8; usize::from(xlen)];
306
307 file.read_exact_at(current_file_position, &mut buf_xlen)?;
308 current_file_position += buf_xlen.len() as u64;
309
310 if buf_xlen[0] != 66 || buf_xlen[1] != 67 {
311 return Err(BGZFError::new("Bad subfield Identifier").into());
312 }
313
314 if ((buf_xlen[2] as u16) | ((buf_xlen[3] as u16) << 8)) != 2 {
315 return Err(BGZFError::new("Bad subfield Length").into());
316 }
317
318 let bsize = (buf_xlen[4] as u16) | ((buf_xlen[5] as u16) << 8);
319 let block_size = u32::from(bsize) + 1;
320 let data_length = bsize - xlen - 19;
321 let data_offset = current_file_position;
322
323 current_file_position += u64::from(data_length) + 4;
325
326 let mut buf_isize = [0; 4];
327 file.read_exact_at(current_file_position, &mut buf_isize)?;
328
329 let i_size: u32 = (buf_isize[0] as u32)
330 | ((buf_isize[1] as u32) << 8)
331 | ((buf_isize[2] as u32) << 16)
332 | ((buf_isize[3] as u32) << 24);
333
334 if i_size == 0 {
335 return Ok(None);
336 }
337
338 let block = BgzfBlock {
339 data_offset,
340 data_length: u32::from(data_length),
341 input_length: u32::from(i_size),
342 block_size,
343 };
344 Ok(Some(block))
345}
346
/// Error raised by the BGZF reader for malformed input or invalid arguments.
///
/// It carries nothing but a human-readable message.
#[derive(Debug)]
struct BGZFError {
    /// Text shown when the error is displayed.
    msg: String,
}

impl BGZFError {
    /// Builds an error holding an owned copy of `msg`.
    fn new(msg: &str) -> BGZFError {
        let msg = String::from(msg);
        BGZFError { msg }
    }
}

impl Error for BGZFError {
    /// Returns the stored message.
    fn description(&self) -> &str {
        self.msg.as_str()
    }
}

impl fmt::Display for BGZFError {
    /// Writes the stored message verbatim.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.write_str(&self.msg)
    }
}
371
#[cfg(test)]
mod tests {
    use super::*;

    /// Fixture used by every test; the assertions below expect it to contain a
    /// single block holding 280 uncompressed bytes.
    const TEST_FILE: &str = "bgzf_test.bgz";

    /// Opens the fixture through the public constructor, failing the test with
    /// the underlying error if that is not possible.
    fn open_reader() -> BgzfReader {
        BgzfReader::new(String::from(TEST_FILE)).expect("failed to open BGZF test file")
    }

    /// Checks the layout values every test expects for the fixture's only block.
    fn assert_first_block(block: &BgzfBlock) {
        assert_eq!(block.block_size, 211);
        assert_eq!(block.data_length, 185);
        assert_eq!(block.data_offset, 18);
        assert_eq!(block.input_length, 280);
    }

    #[test]
    fn test_read_block_func() {
        let bgzf_file = File::open(TEST_FILE).unwrap();
        let block = read_block(&bgzf_file, 0)
            .expect("read_block returned an error")
            .expect("first block must not be the end-of-file marker");
        assert_first_block(&block);
    }

    #[test]
    fn test_bgzf_reader_new_func() {
        let reader = open_reader();
        let expected_uncompressed_length = 280;
        assert_eq!(1, reader.block_tree.len());
        assert_eq!(expected_uncompressed_length, reader.input_length);
        assert_eq!(0, reader.current_read_position.get());

        let block = reader
            .block_tree
            .get(&0)
            .expect("block index has no entry at offset 0");
        assert_first_block(block);
    }

    #[test]
    fn test_bgzf_read_method() {
        let reader = open_reader();

        let mut content = vec![0; 10];
        let read = reader.read(&mut content, 0, 10).expect("first read failed");
        assert_eq!(10, read);
        let file_content = str::from_utf8(&content).unwrap();
        assert_eq!("This is ju", file_content);

        reader.seek(20);
        let mut content_two = vec![0; 32];
        let read_two = reader
            .read(&mut content_two, 0, 32)
            .expect("second read failed");
        assert_eq!(32, read_two);
        let file_content_two = str::from_utf8(&content_two).unwrap();
        assert_eq!("test,lets see how it reacts. :).", file_content_two);
    }

    #[test]
    fn test_seek_method() {
        let reader = open_reader();
        reader.seek(33);
        assert_eq!(0, reader.current_read_position.get());
        assert_eq!(33, reader.pos.get());
    }

    #[test]
    fn test_read_to() {
        let reader = open_reader();
        let mut vec = vec![0; 52];
        let data_read = reader.read_to(&mut vec);
        assert_eq!(data_read.unwrap(), 52);
        assert_eq!(
            "This is just a bgzf test,lets see how it reacts. :).",
            str::from_utf8(&vec).unwrap()
        );
    }

    #[test]
    fn test_cache() {
        let reader = open_reader();
        let mut vec = vec![0; 52];
        let data_read = reader.read_to(&mut vec);
        assert_eq!(data_read.unwrap(), 52);
        assert_eq!(
            "This is just a bgzf test,lets see how it reacts. :).",
            str::from_utf8(&vec).unwrap()
        );

        // The next two reads continue inside the block decoded above, so they
        // are expected to be served from the cache.
        let mut vec2 = vec![0; 119];
        let data_read_2 = reader.read_to(&mut vec2);
        assert_eq!(data_read_2.unwrap(), 119);
        assert_eq!(
            " I think it will work fine, but who knows this is still a software. Unless you have tested it 100% there is no guarante",
            str::from_utf8(&vec2).unwrap()
        );

        let mut vec3 = vec![0; 2];
        let data_read_3 = reader.read_to(&mut vec3);
        assert_eq!(data_read_3.unwrap(), 2);
        assert_eq!("e ", str::from_utf8(&vec3).unwrap());
    }
}