bgzf_rust_reader/
lib.rs

1use libdeflater::Decompressor;
2use positioned_io::ReadAt;
3use std::cell::Cell;
4use std::cell::RefCell;
5use std::cmp::min;
6use std::collections::BTreeMap;
7use std::error;
8use std::fs::File;
9use std::ops::Bound::{Excluded, Included};
10use std::str;
11use std::{error::Error, fmt};
12
/// Location and size information for a single BGZF block.
///
/// All values are produced by `read_block` while the file is being indexed.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
struct BgzfBlock {
  /// Offset in the compressed file at which the block's real (deflate) data starts.
  data_offset: u64,
  /// Length of the compressed data, i.e. the block size minus header and footer.
  data_length: u32,
  /// Length of the data once it is uncompressed.
  input_length: u32,
  /// Total length of the block in the compressed file.
  block_size: u32,
}
26
/// Cache holding the uncompressed data of one whole block.
#[derive(Debug, Clone, PartialEq, Eq)]
struct Cache {
  /// Position, in the uncompressed stream, of the first cached byte.
  pos: u64,
  /// The uncompressed bytes of the cached block.
  uncompressed_data: Vec<u8>,
}
33
34/// Struct to read bgzf file
35///
36/// Fields description:
37///
38/// input_length: total length of the uncompressed version,
39/// current_read_position: current position of the compressed file,
40/// pos: current position of the uncompressed file,
41pub struct BgzfReader {
42  bgzf_file: File,
43  block_tree: BTreeMap<u64, BgzfBlock>,
44  cache: RefCell<Option<Cache>>,
45  pub input_length: u64,
46  pub current_read_position: Cell<u64>,
47  pub pos: Cell<u64>,
48}
49
50/// Below are the steps to use the bgzf Reader,
51/// 1st step is to create a BGZF instance with a new function
52/// after that read, and seek method can be used respectively.
53///
54/// # Example
55/// ```
56/// use bgzf_rust_reader::BgzfReader;
57/// use std::str;
58///
59///  let reader = BgzfReader::new(String::from("bgzf_test.bgz")).unwrap();
60///  let mut vec = vec![0; 52];
61///  let data_read = reader.read_to(&mut vec);
62///  assert_eq!(data_read.unwrap(), 52);
63///  assert_eq!(
64///    "This is just a bgzf test,lets see how it reacts. :).",
65///    str::from_utf8(&vec).unwrap()
66///  );
67///
68/// ```
69impl BgzfReader {
70  pub fn new(file_path: String) -> Result<BgzfReader, Box<dyn error::Error>> {
71    let mut b_tree = BTreeMap::new();
72    let bgzf_file = File::open(file_path)?;
73    let mut input_offset: u64 = 0;
74    let mut current_file_position = 0;
75    loop {
76      match read_block(&bgzf_file, current_file_position) {
77        Ok(option_block) => match option_block {
78          Some(block) => {
79            let input_length_block = block.input_length;
80            let block_size_block = block.block_size;
81            b_tree.insert(input_offset, block);
82            input_offset += u64::from(input_length_block);
83            current_file_position += u64::from(block_size_block);
84          }
85          None => break,
86        },
87        Err(_e) => break,
88      }
89    }
90    let reader = BgzfReader {
91      bgzf_file,
92      block_tree: b_tree,
93      input_length: input_offset,
94      current_read_position: Cell::new(0),
95      pos: Cell::new(0),
96      cache: RefCell::new(None),
97    };
98    Ok(reader)
99  }
100
101  /// This method can set the file position relative to uncompressed data
102  ///
103  /// # Example
104  /// ```
105  /// use bgzf_rust_reader::BgzfReader;
106  ///
107  ///let reader = BgzfReader::new(String::from("bgzf_test.bgz")).unwrap();
108  ///reader.seek(33);
109  /// assert_eq!(0, reader.current_read_position.get());
110  ///assert_eq!(33, reader.pos.get());
111  ///
112  /// ```
113  pub fn seek(&self, pos: u64) {
114    self.pos.set(pos);
115  }
116
117  /// This method calculates total uncompressed length
118  pub fn total_uncompressed_length(&self) -> u64 {
119    self.input_length
120  }
121
122  /// this method reads data to the slice passed
123  ///
124  /// # Example
125  /// ```
126  /// use bgzf_rust_reader::BgzfReader;
127  /// use std::str;
128  ///
129  ///  let reader = BgzfReader::new(String::from("bgzf_test.bgz")).unwrap();
130  ///  let mut vec = vec![0; 52];
131  ///  let data_read = reader.read_to(&mut vec);
132  ///  assert_eq!(data_read.unwrap(), 52);
133  ///  assert_eq!(
134  ///    "This is just a bgzf test,lets see how it reacts. :).",
135  ///    str::from_utf8(&vec).unwrap()
136  ///  );
137  ///
138  /// ```
139  pub fn read_to(&self, b: &mut Vec<u8>) -> Result<i32, Box<dyn error::Error>> {
140    self.read(b, 0, b.len())
141  }
142
143  /// this method reads data to the slice from offset position,
144  /// up to the len position
145  ///
146  /// # Example
147  /// ```
148  /// use bgzf_rust_reader::BgzfReader;
149  /// use std::str;
150  ///
151  /// let reader = BgzfReader::new(String::from("bgzf_test.bgz")).unwrap();
152  /// let mut content = vec![0; 10];
153  /// match reader.read(&mut content, 0, 10) {
154  ///  Ok(val) => {
155  ///   assert_eq!(10, val);
156  ///  }
157  ///  Err(e) => {
158  ///    assert!(false);
159  ///  }
160  /// };
161  ///let file_content = str::from_utf8(&content).unwrap();
162  ///  assert_eq!("This is ju", file_content);
163  ///
164  /// ```
165  pub fn read(
166    &self,
167    b: &mut Vec<u8>,
168    off: usize,
169    len: usize,
170  ) -> Result<i32, Box<dyn error::Error>> {
171    if b.len() == 0 {
172      return Err(BGZFError::new("Buffer size needs to be greater than 0").into());
173    }
174    if len > b.len() - off {
175      return Err(BGZFError::new("Index out of bound exception").into());
176    }
177    if len == 0 {
178      return Ok(0);
179    }
180    if self.pos.get() >= self.input_length {
181      return Ok(-1);
182    }
183
184    let mut off = off;
185    let mut len = len;
186    let mut cb: i32 = 0;
187
188    match self.cache.borrow().as_ref() {
189      Some(cache) => {
190        if self.pos.get() >= cache.pos {
191          let bytes_available_in_cache =
192            cache.pos as usize + cache.uncompressed_data.len() - self.pos.get() as usize;
193          if bytes_available_in_cache > 0 {
194            let copy_start = (self.pos.get() - cache.pos) as usize;
195            let copy_length = min(bytes_available_in_cache, len);
196            let end_index = copy_start + copy_length;
197            b[off..]
198              .copy_from_slice(&cache.uncompressed_data[copy_start as usize..end_index as usize]);
199            cb += copy_length as i32;
200            off += copy_length;
201            len -= copy_length;
202            self.pos.set(self.pos.get() + copy_length as u64);
203            if len == 0 {
204              return Ok(cb);
205            }
206          }
207        }
208      }
209      None => {
210        //If there is no cache available lets move forward
211      }
212    }
213
214    let mut un_compressor = Decompressor::new();
215
216    #[derive(Copy, Clone)]
217    struct Entry {
218      key: u64,
219      value: BgzfBlock,
220    }
221
222    let mut entry_vector: Vec<Entry> = Vec::new();
223
224    if !self.block_tree.contains_key(&self.pos.get()) {
225      let floored_value = self.block_tree.range(..self.pos.get()).next_back().unwrap();
226      //Getting a floored value if we do not find pos in the tree.
227      entry_vector.push(Entry {
228        key: *floored_value.0,
229        value: *floored_value.1,
230      });
231    }
232    //Get all the blocks from the block tree that is within the range of
233    //pos and length of the buffer passed
234    let pos_and_len_combined = self.pos.get() + len as u64;
235    for (&key, &value) in self
236      .block_tree
237      .range((Included(self.pos.get()), Excluded(pos_and_len_combined)))
238    {
239      entry_vector.push(Entry { key, value });
240    }
241
242    for entry in entry_vector {
243      let block = entry.value;
244      let input_offset = entry.key;
245
246      //Reading compressed data from the block
247      let mut compressed = vec![0u8; block.data_length as usize];
248      self
249        .bgzf_file
250        .read_exact_at(block.data_offset, &mut compressed)?;
251
252      //now it's time to de-compress the read value obtained.
253      let mut uncompressed = vec![0u8; block.input_length as usize];
254      let bytes_decompressed =
255        un_compressor.deflate_decompress(&mut compressed, &mut uncompressed)?;
256
257      if bytes_decompressed == 0 || bytes_decompressed != block.input_length as usize {
258        return Err(BGZFError::new("Did not fully de-compress").into());
259      }
260
261      self.cache.replace(Some(Cache {
262        pos: input_offset,
263        uncompressed_data: uncompressed.clone(),
264      }));
265
266      let mut copy_start: u64 = 0;
267      //total uncompressed size is input_length
268      let mut copy_length = block.input_length;
269      if input_offset < self.pos.get() {
270        let copy_skip = self.pos.get() - input_offset;
271        copy_start += copy_skip;
272        copy_length -= copy_skip as u32;
273      }
274
275      if copy_length > len as u32 {
276        copy_length = len as u32;
277      }
278      let end_index = copy_start + u64::from(copy_length);
279      b[off..].copy_from_slice(&uncompressed[copy_start as usize..end_index as usize]);
280      len -= copy_length as usize;
281      self.pos.set(self.pos.get() + u64::from(copy_length));
282      off += copy_length as usize;
283      cb += copy_length as i32;
284    }
285    Ok(cb)
286  }
287}
288
289fn read_block(
290  file: &File,
291  current_file_position: u64,
292) -> Result<Option<BgzfBlock>, Box<dyn error::Error>> {
293  let mut current_file_position = current_file_position;
294
295  let mut buf = [0; 12];
296  file.read_exact_at(current_file_position, &mut buf)?;
297  current_file_position += buf.len() as u64;
298
299  if buf[0] != 31 || buf[1] != 139 || buf[2] != 8 || buf[3] != 4 {
300    return Err(BGZFError::new("Incorrect header").into());
301  }
302
303  let xlen: u16 = (buf[10] as u16) | ((buf[11] as u16) << 8);
304
305  let mut buf_xlen = vec![0u8; usize::from(xlen)];
306
307  file.read_exact_at(current_file_position, &mut buf_xlen)?;
308  current_file_position += buf_xlen.len() as u64;
309
310  if buf_xlen[0] != 66 || buf_xlen[1] != 67 {
311    return Err(BGZFError::new("Bad subfield Identifier").into());
312  }
313
314  if ((buf_xlen[2] as u16) | ((buf_xlen[3] as u16) << 8)) != 2 {
315    return Err(BGZFError::new("Bad subfield Length").into());
316  }
317
318  let bsize = (buf_xlen[4] as u16) | ((buf_xlen[5] as u16) << 8);
319  let block_size = u32::from(bsize) + 1;
320  let data_length = bsize - xlen - 19;
321  let data_offset = current_file_position;
322
323  //Skip data block
324  current_file_position += u64::from(data_length) + 4;
325
326  let mut buf_isize = [0; 4];
327  file.read_exact_at(current_file_position, &mut buf_isize)?;
328
329  let i_size: u32 = (buf_isize[0] as u32)
330    | ((buf_isize[1] as u32) << 8)
331    | ((buf_isize[2] as u32) << 16)
332    | ((buf_isize[3] as u32) << 24);
333
334  if i_size == 0 {
335    return Ok(None);
336  }
337
338  let block = BgzfBlock {
339    data_offset,
340    data_length: u32::from(data_length),
341    input_length: u32::from(i_size),
342    block_size,
343  };
344  Ok(Some(block))
345}
346
/// Error reported by this file's BGZF code; it carries only a text message.
#[derive(Debug)]
struct BGZFError {
  /// Human readable description of what went wrong.
  msg: String,
}

impl BGZFError {
  /// Builds an error whose message is a copy of `msg`.
  fn new(msg: &str) -> BGZFError {
    let msg = String::from(msg);
    BGZFError { msg }
  }
}

impl Error for BGZFError {
  /// Returns the stored message.
  fn description(&self) -> &str {
    self.msg.as_str()
  }
}

impl fmt::Display for BGZFError {
  /// Writes the stored message as-is.
  fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
    f.write_str(&self.msg)
  }
}
371
#[cfg(test)]
mod tests {
  use super::*;

  /// Fixture file used by every test in this module.
  const TEST_FILE: &str = "bgzf_test.bgz";

  /// Opens the fixture; a failure reports the underlying error.
  fn open_reader() -> BgzfReader {
    BgzfReader::new(String::from(TEST_FILE)).expect("fixture should open as a BGZF file")
  }

  /// Checks the values expected for the first block of the fixture.
  fn assert_first_block(block: &BgzfBlock) {
    assert_eq!(block.block_size, 211);
    assert_eq!(block.data_length, 185);
    assert_eq!(block.data_offset, 18);
    assert_eq!(block.input_length, 280);
  }

  #[test]
  fn test_read_block_func() {
    let bgzf_file = File::open(TEST_FILE).unwrap();
    let block = read_block(&bgzf_file, 0)
      .expect("first block should be readable")
      .expect("first block should not be empty");
    assert_first_block(&block);
  }

  #[test]
  fn test_bgzf_reader_new_func() {
    let reader = open_reader();
    let expected_uncompressed_length = 280;
    assert_eq!(1, reader.block_tree.len());
    assert_eq!(expected_uncompressed_length, reader.input_length);
    assert_eq!(0, reader.current_read_position.get());

    let block = reader
      .block_tree
      .get(&0)
      .expect("a block should start at uncompressed offset 0");
    assert_first_block(block);
  }

  #[test]
  fn test_bgzf_read_method() {
    let reader = open_reader();
    let mut content = vec![0; 10];
    let val = reader.read(&mut content, 0, 10).expect("first read should succeed");
    assert_eq!(10, val);
    let file_content = str::from_utf8(&content).unwrap();
    assert_eq!("This is ju", file_content);

    reader.seek(20);
    let mut content_two = vec![0; 32];
    let val_two = reader
      .read(&mut content_two, 0, 32)
      .expect("read after seek should succeed");
    assert_eq!(32, val_two);
    let file_content_two = str::from_utf8(&content_two).unwrap();
    assert_eq!("test,lets see how it reacts. :).", file_content_two);
  }

  #[test]
  fn test_seek_method() {
    let reader = open_reader();
    reader.seek(33);
    assert_eq!(0, reader.current_read_position.get());
    assert_eq!(33, reader.pos.get());
  }

  #[test]
  fn test_read_to() {
    let reader = open_reader();
    let mut vec = vec![0; 52];
    let data_read = reader.read_to(&mut vec);
    assert_eq!(data_read.unwrap(), 52);
    assert_eq!(
      "This is just a bgzf test,lets see how it reacts. :).",
      str::from_utf8(&vec).unwrap()
    );
  }

  #[test]
  fn test_cache() {
    let reader = open_reader();
    let mut vec = vec![0; 52];
    let data_read = reader.read_to(&mut vec);
    assert_eq!(data_read.unwrap(), 52);
    assert_eq!(
      "This is just a bgzf test,lets see how it reacts. :).",
      str::from_utf8(&vec).unwrap()
    );

    // The following reads are served from the block cached by the first one.
    let mut vec2 = vec![0; 119];
    let data_read_2 = reader.read_to(&mut vec2);
    assert_eq!(data_read_2.unwrap(), 119);
    assert_eq!(
      " I think it will work fine, but who knows this is still a software. Unless you have tested it 100% there is no guarante",
      str::from_utf8(&vec2).unwrap()
    );

    let mut vec3 = vec![0; 2];
    let data_read_3 = reader.read_to(&mut vec3);
    assert_eq!(data_read_3.unwrap(), 2);
    assert_eq!("e ", str::from_utf8(&vec3).unwrap());
  }
}