bcf_reader/lib.rs
1//! # bcf_reader
2//! This is an attempt to create a small, lightweight, pure-Rust library to allow
3//! efficient, cross-platform access to genotype data in BCF files.
4//!
5//! Currently, the `rust_htslib` crate works only on Linux and macOS (not Windows?).
6//! The noodles crate is a pure Rust library for many bioinformatic file formats and
7//! works across Windows, Linux, and macOS. However, the `noodles` API for reading
8//! genotype data from BCF files can be slow due to its memory allocation patterns.
9//! Additionally, both crates have a large number of dependencies, as they provide
10//! many features and support a wide range of file formats.
11//!
12//! One way to address the memory allocation and dependency issues is to manually
13//! parse BCF records according to its specification
14//! (`<https://samtools.github.io/hts-specs/VCFv4.2.pdf>`) and use iterators whenever
15//! possible, especially for the per-sample fields, like GT and AD.
16//!
17//! Note: This crate is in its early stages of development.
18//!
19//! ## Usage
20//! ```
21//! use bcf_reader::*;
22//! let mut reader = smart_reader("testdata/test2.bcf").unwrap();
23//! let header = Header::from_string(&read_header(&mut reader).unwrap()).unwrap();
24//! // find key for a field in INFO or FORMAT or FILTER
25//! let key = header.get_idx_from_dictionary_str("FORMAT", "GT").unwrap();
26//! // access header dictionary
27//! let d = &header.dict_strings()[&key];
28//! assert_eq!(d["ID"], "GT");
29//! assert_eq!(d["Dictionary"], "FORMAT");
//! // get chromosome name
31//! assert_eq!(header.get_chrname(0), "Pf3D7_01_v3");
32//! let fmt_ad_key = header
33//! .get_idx_from_dictionary_str("FORMAT", "AD")
34//! .expect("FORMAT/AD not found");
35//! let info_af_key = header
36//! .get_idx_from_dictionary_str("INFO", "AF")
37//! .expect("INFO/AF not found");
38//!
39//! // this can be and should be reused to reduce allocation
40//! let mut record = Record::default();
41//! while let Ok(_) = record.read(&mut reader) {
42//! let pos = record.pos();
43//!
44//! // use byte ranges and shared buffer to get allele string values
45//! let allele_byte_ranges = record.alleles();
46//! let share_buf = record.buf_shared();
47//! let ref_rng = &allele_byte_ranges[0];
48//! let ref_allele_str = std::str::from_utf8(&share_buf[ref_rng.clone()]).unwrap();
49//! let alt1_rng = &allele_byte_ranges[1];
50//! let alt1_allele_str = std::str::from_utf8(&share_buf[alt1_rng.clone()]).unwrap();
51//! // ...
52//!
53//! // access FORMAT/GT via iterator
54//! for nv in record.fmt_gt(&header) {
55//! let nv = nv.unwrap();
56//! let (has_no_ploidy, is_missing, is_phased, allele_idx) = nv.gt_val();
57//! // ...
58//! }
59//!
60//! // access FORMAT/AD via iterator
61//! for nv in record.fmt_field(fmt_ad_key) {
62//! let nv = nv.unwrap();
63//! match nv.int_val() {
64//! None => {}
65//! Some(ad) => {
66//! // ...
67//! }
68//! }
69//! // ...
70//! }
71//!
//! // access FILTERS via iterator
73//! record.filters().for_each(|nv| {
74//! let nv = nv.unwrap();
75//! let filter_key = nv.int_val().unwrap() as usize;
76//! let dict_string_map = &header.dict_strings()[&filter_key];
77//! let filter_name = &dict_string_map["ID"];
78//! // ...
79//! });
80//!
//! // access INFO/AF via iterator
82//! record.info_field_numeric(info_af_key).for_each(|nv| {
83//! let nv = nv.unwrap();
84//! let af = nv.float_val().unwrap();
85//! // ...
86//! });
87//! }
88//! ```
89//!
90//! More examples to access each field/column are available in docs of [`Record`] and [`Header`].
91//!
92//! # Reader types
93//! - For parallelized decompression reader, see [`BcfReader`].
//! - For parallelized indexed reader, see [`IndexedBcfReader`].
//! - For the lower-level reader underlying `BcfReader` and `IndexedBcfReader`,
//!   see [`ParMultiGzipReader`].
97//!
98//! # `flate2` backends
99//!
//! By default, a rust backend is used. The other `flate2` backends, `zlib` and
//! `zlib-ng-compat`, have been exported as the corresponding features (`zlib` and
//! `zlib-ng-compat`). See <https://docs.rs/flate2/latest/flate2/> for more details.
103//!
104#![warn(clippy::unwrap_used, clippy::expect_used, clippy::dbg_macro)]
105use byteorder::{LittleEndian, ReadBytesExt};
106use flate2::bufread::DeflateDecoder;
107use rayon::prelude::*;
108use std::fmt::Debug;
109use std::fs::File;
110use std::io;
111use std::io::BufReader;
112use std::io::Read;
113use std::ops::Range;
114use std::path::Path;
115use std::{collections::HashMap, io::Seek};
116
/// Crate-wide result alias using this crate's [`Error`] type.
pub type Result<T> = std::result::Result<T, Error>;
118
/// All error conditions surfaced by this crate.
#[derive(Debug)]
pub enum Error {
    /// Wrapped I/O error from the underlying reader.
    Io(std::io::Error),
    /// An operation required the BCF header before it was parsed.
    HeaderNotParsed,
    /// An indexed read was attempted without a genome interval.
    IndexBcfReaderMissingGenomeInterval,
    /// No CSI bin-index entry matched the requested region.
    CsBinIndexNotFound,
    /// Owned bytes (e.g. header text) were not valid UTF-8.
    FromUtf8Error(std::string::FromUtf8Error),
    /// A borrowed byte slice was not valid UTF-8.
    Utf8Error(std::str::Utf8Error),
    /// The header text could not be parsed into structured data.
    ParseHeaderError(ParseHeaderError),
    /// An integer was requested from an empty numeric value.
    /// (Variant name keeps its historical spelling for API stability.)
    NumericaValueEmptyInt,
    /// A non-`U32` value was reinterpreted as an `f32`; see `NumericValue::as_f32`.
    NumericaValueAsF32Error,
    /// Catch-all error carrying a free-form message.
    Other(String),
}
132
/// Describes a mismatch between the expected and actually-read value of a
/// fixed gzip (BGZF) member-header field.
/// NOTE(review): constructed by gzip-reading code outside this chunk.
#[derive(Debug)]
pub struct ParseGzipHeaderError {
    /// Name of the offending header field.
    pub key: &'static str,
    /// Value the field was expected to hold.
    pub expected: usize,
    /// Value actually read from the stream.
    pub found: usize,
}
139
/// Errors produced while parsing the textual BCF/VCF header lines.
#[derive(Debug)]
pub enum ParseHeaderError {
    /// A header line did not start with the expected `##` prefix.
    HeaderCommentCharError,
    /// A header line had no dictionary name before the first `=`.
    MissingDictionaryname,
    /// Any other structural problem; the payload names the missing piece.
    FormatError(&'static str),
}
146
147trait AddContext {
148 fn add_context(self, context: &'static str) -> Error;
149}
150impl AddContext for std::io::Error {
151 fn add_context(self, context: &'static str) -> Error {
152 Error::from(std::io::Error::new(self.kind(), context))
153 }
154}
155
156// --- Display
157impl std::fmt::Display for Error {
158 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
159 write!(f, "{:?}", *self)
160 }
161}
162impl std::fmt::Display for ParseHeaderError {
163 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
164 write!(f, "{:?}", *self)
165 }
166}
167impl std::fmt::Display for ParseGzipHeaderError {
168 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
169 write!(f, "{:?}", *self)
170 }
171}
172
173// --- Error
174
175impl std::error::Error for Error {}
176
177impl std::error::Error for ParseGzipHeaderError {}
178
179// --- From
180impl From<ParseGzipHeaderError> for std::io::Error {
181 fn from(value: ParseGzipHeaderError) -> Self {
182 std::io::Error::new(std::io::ErrorKind::InvalidData, Box::new(value))
183 }
184}
185impl From<std::io::Error> for Error {
186 fn from(value: std::io::Error) -> Self {
187 Error::Io(value)
188 }
189}
190
/// An iterator used to split a `str` by a separator with separators within
/// pairs of quotes ignored.
pub struct QuotedSplitter<'a> {
    data: &'a str,
    in_quotes: bool,
    sep: char,
    quote: char,
}

impl<'a> QuotedSplitter<'a> {
    /// Creates a new `QuotedSplitter` iterator.
    ///
    /// # Arguments
    ///
    /// * `buffer` - The input string to be split.
    /// * `separator` - The separator character used to split the string.
    /// * `quote` - The quote character used to ignore separators within quotes.
    ///
    /// # Examples
    ///
    /// ```
    /// use bcf_reader::QuotedSplitter;
    /// let input_string = "hello,\"world, this is fun\",test";
    /// let result: Vec<_> = QuotedSplitter::new(input_string, ',', '"').collect();
    /// assert_eq!(result, vec!["hello", "\"world, this is fun\"", "test"]);
    /// ```
    pub fn new(buffer: &'a str, separator: char, quote: char) -> Self {
        QuotedSplitter {
            data: buffer,
            in_quotes: false,
            sep: separator,
            quote,
        }
    }
}

impl<'a> Iterator for QuotedSplitter<'a> {
    type Item = &'a str;

    /// Advances the iterator and returns the next split substring, or `None`
    /// once the remaining input is empty. Quote state persists across calls
    /// so quoted regions may span multiple yielded pieces.
    fn next(&mut self) -> Option<Self::Item> {
        for (idx, ch) in self.data.char_indices() {
            // Toggle the quote state first so a character that is both quote
            // and separator is handled in the same order as a single scan.
            if ch == self.quote {
                self.in_quotes = !self.in_quotes;
            }
            if ch == self.sep && !self.in_quotes {
                let piece = &self.data[..idx];
                let rest = &self.data[idx..];
                self.data = rest.strip_prefix(self.sep).unwrap_or(rest);
                return Some(piece);
            }
        }
        if self.data.is_empty() {
            None
        } else {
            // Yield the trailing segment and leave the iterator exhausted.
            Some(std::mem::take(&mut self.data))
        }
    }
}
256
257/// Represents a header of a BCF file.
258///
259/// The `Header` struct contains information about the dictionar of strings and
260/// contigs, samples, and the index of the FORMAT field "GT". It provides
261/// methods to parse a header from a string, retrieve the index of a dictionary
262/// string, retrieve the chromosome name by index, retrieve the index of the
263/// FORMAT field "GT", and access the dictionary strings, contigs, and samples.
264///
265/// # Examples
266///
267/// ```
268/// use bcf_reader::Header;
269///
270/// let header_text = concat!(
271/// r#"##fileformat=VCFv4.3"#,
272/// "\n",
273/// r#"##FILTER=<ID=PASS,Description="All filters passed">"#,
274/// "\n",
275/// r#"##FILTER=<ID=FAILED1,Description="failed due to something">"#,
276/// "\n",
277/// r#"##contig=<ID=chr1,length=123123123>"#,
278/// "\n",
279/// r#"##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">"#,
280/// "\n",
281/// r#"##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">"#,
282/// "\n",
283/// "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample1\tsample2",
284/// "\n",
285/// );
286///
287/// let header = Header::from_string(&header_text).unwrap();
288///
289/// assert_eq!(header.get_idx_from_dictionary_str("INFO", "DP"), Some(2));
290/// assert_eq!(header.get_chrname(0), "chr1");
291/// assert_eq!(header.get_fmt_gt_id(), Some(3));
292/// assert_eq!(header.dict_contigs().len(), 1);
293/// assert_eq!(header.dict_strings().len(), 4);
294/// assert_eq!(header.get_samples().len(), 2);
295/// ```
#[derive(Debug)]
pub struct Header {
    /// FILTER/INFO/FORMAT dictionary: outer key is the header idx (or explicit
    /// IDX), inner map holds the line's key/value pairs plus a "Dictionary" key.
    dict_strings: HashMap<usize, HashMap<String, String>>,
    /// Contig dictionary: outer key is the contig idx (or explicit IDX).
    dict_contigs: HashMap<usize, HashMap<String, String>>,
    /// Sample names taken from the #CHROM line (columns 10 and onward).
    samples: Vec<String>,
    /// Cached dictionary index of FORMAT/GT, if present.
    fmt_gt_idx: Option<usize>,
}
303impl Header {
    /// Parse raw header text (the `##...` lines plus the `#CHROM` line) into
    /// a structured [`Header`].
    ///
    /// An implicit FILTER/PASS entry is always installed at index 0. Entries
    /// either all carry an explicit `IDX=` key or none do; mixing the two
    /// styles is rejected with a `FormatError`.
    pub fn from_string(text: &str) -> std::result::Result<Self, ParseHeaderError> {
        use ParseHeaderError::*;
        let mut dict_strings = HashMap::<usize, HashMap<String, String>>::new();
        let mut dict_contigs = HashMap::<usize, HashMap<String, String>>::new();
        let mut samples = Vec::<String>::new();

        // implicit FILTER/PASS header line
        let mut m = HashMap::<String, String>::new();
        m.insert("Dictionary".into(), "FILTER".into());
        m.insert("ID".into(), "PASS".into());
        m.insert("Description".into(), r#""All filters passed""#.into());
        dict_strings.insert(0, m);
        // counters start past the implicit PASS entry (strings) / at 0 (contigs)
        let mut dict_str_idx_counter = 1;
        let mut dict_contig_idx_counter = 0;
        for line in QuotedSplitter::new(text.trim_end_matches('\0').trim(), '\n', '"') {
            // the #CHROM line: sample names start at column 10
            if line.starts_with("#CHROM") {
                line.split("\t")
                    .skip(9)
                    .for_each(|s| samples.push(s.into()));
                continue;
            }
            if line.trim().is_empty() {
                continue;
            }
            let mut it = QuotedSplitter::new(
                line.strip_prefix("##").ok_or(HeaderCommentCharError)?,
                '=',
                '"',
            );
            let dict_name = it.next().ok_or(MissingDictionaryname)?;
            // only structured lines of the form ##NAME=<...> feed a dictionary
            let valid_dict = matches!(it.next(), Some(x) if x.starts_with("<"));
            if !valid_dict {
                continue;
            }
            // take the text between the outermost '<' and '>'
            let l = line.find('<').ok_or(FormatError("'<' not found"))?;
            let s = line.split_at(l + 1).1;
            let r = s.rfind('>').ok_or(FormatError("> not found"))?;
            let s = s.split_at(r).0;
            // collect key=value pairs (quotes stripped from values)
            let mut m = HashMap::<String, String>::new();
            for kv_str in QuotedSplitter::new(s, ',', '"') {
                let kv_str = kv_str.trim();

                let mut it = QuotedSplitter::new(kv_str, '=', '"');
                let k = it.next().ok_or(FormatError("key not found"))?;
                let v = it
                    .next()
                    .ok_or(FormatError("value not found"))?
                    .trim_end_matches('"')
                    .trim_start_matches('"');
                m.insert(k.into(), v.into());
            }
            match dict_name {
                "contig" => {
                    if m.contains_key("IDX") {
                        let idx: usize = m["IDX"]
                            .parse()
                            .map_err(|_| FormatError("IDX value parsing error"))?;
                        if dict_contig_idx_counter != 0 {
                            // if one dict string has IDX all of them should have IDX in the dictionary
                            return Err(FormatError("not all CONTIG lines have key IDX"));
                        }
                        dict_contigs.insert(idx, m);
                    } else {
                        dict_contigs.insert(dict_contig_idx_counter, m);
                        dict_contig_idx_counter += 1;
                    }
                }
                _ => {
                    // skip the explicit FILTER/PASS line (index 0 is reserved
                    // for the implicit one) and anything that is not
                    // INFO/FILTER/FORMAT
                    if ((dict_name != "FILTER") || (&m["ID"] != "PASS"))
                        && ["INFO", "FILTER", "FORMAT"].iter().any(|x| *x == dict_name)
                    {
                        m.insert("Dictionary".into(), dict_name.into());
                        if m.contains_key("IDX") {
                            let idx: usize = m["IDX"]
                                .parse()
                                .map_err(|_| FormatError("IDX value parsing error"))?;
                            if dict_str_idx_counter != 1 {
                                // if one dict string has IDX all of them should have IDX in the dictionary
                                return Err(FormatError(
                                    "not all INFO/FILTER/FORMAT lines have key IDX",
                                ));
                            }
                            dict_strings.insert(idx, m);
                        } else {
                            dict_strings.insert(dict_str_idx_counter, m);
                            dict_str_idx_counter += 1;
                        }
                    }
                }
            };
        }

        // find fmt_key for FORMAT/GT for convenience
        let mut fmt_gt_idx = None;
        for (k, m) in dict_strings.iter() {
            if (&m["Dictionary"] == "FORMAT") && (&m["ID"] == "GT") {
                fmt_gt_idx = Some(*k);
            }
        }

        Ok(Self {
            dict_strings,
            dict_contigs,
            samples,
            fmt_gt_idx,
        })
    }
414
415 /// Find the key (offset in header line) for a given INFO/xx or FILTER/xx or FORMAT/xx field.
416 ///
417 /// Example:
418 /// ```
419 /// use bcf_reader::*;
420 /// let mut f = smart_reader("testdata/test.bcf").unwrap();
421 /// let s = read_header(&mut f).unwrap();
422 /// let header = Header::from_string(&s).unwrap();
423 /// let key_found = header.get_idx_from_dictionary_str("FORMAT", "GT").unwrap();
424 /// assert_eq!(key_found, header.get_fmt_gt_id().unwrap());
425 /// ```
426 pub fn get_idx_from_dictionary_str(&self, dictionary: &str, field: &str) -> Option<usize> {
427 for (k, m) in self.dict_strings.iter() {
428 if (m["Dictionary"] == dictionary) && (m["ID"] == field) {
429 return Some(*k);
430 }
431 }
432 None
433 }
434
    /// Get chromosome name from the contig index
    ///
    /// # Panics
    /// Panics if `idx` is absent from the contig dictionary or the contig
    /// entry has no "ID" key (plain `HashMap` indexing).
    pub fn get_chrname(&self, idx: usize) -> &str {
        &self.dict_contigs[&idx]["ID"]
    }

    /// Get key for FORMAT/GT field, or `None` when the header declares no GT.
    pub fn get_fmt_gt_id(&self) -> Option<usize> {
        self.fmt_gt_idx
    }

    /// Get hashmap of hashmap of dictionary of contigs
    /// outer key: contig_idx
    /// inner key: is the key of the dictionary of contig, such as 'ID', 'Description'
    pub fn dict_contigs(&self) -> &HashMap<usize, HashMap<String, String>> {
        &self.dict_contigs
    }

    /// Get hashmap of hashmap of dictionary of strings
    /// outer key: item_idx, for FILTER/xx, FORMAT/xx, INFO/xx
    /// inner key: is the key of the dictionary of string, such as 'ID', 'Description'
    pub fn dict_strings(&self) -> &HashMap<usize, HashMap<String, String>> {
        &self.dict_strings
    }

    /// Get sample names (in #CHROM-line column order).
    /// Example:
    /// ```
    /// use bcf_reader::*;
    /// // read data generated by bcftools
    /// // bcftools query -l test.bcf | bgzip -c > test_samples.gz
    /// let mut samples_str = String::new();
    /// smart_reader("./testdata/test_samples.gz")
    ///     .unwrap()
    ///     .read_to_string(&mut samples_str)
    ///     .unwrap();
    /// // read data via bcf-reader
    /// let mut f = smart_reader("testdata/test.bcf").unwrap();
    /// let s = read_header(&mut f).unwrap();
    /// let header = Header::from_string(&s).unwrap();
    /// let samples_str2 = header.get_samples().join("\n");
    /// // compare bcftools results and bcf-reader results
    /// assert_eq!(samples_str.trim(), samples_str2.trim());
    /// ```
    pub fn get_samples(&self) -> &Vec<String> {
        &self.samples
    }
481}
482
/// map bcf2 type to width in bytes
///
/// `typ`:
/// - 0: MISSING (0 bytes)
/// - 1: u8 (1 byte)
/// - 2: u16 (2 bytes)
/// - 3: u32 (4 bytes)
/// - 5: f32 (4 bytes)
/// - 7: c-char (u8, 1 byte)
///
/// # Panics
/// Panics on any other type code (4 and 6 are reserved in BCF2).
pub fn bcf2_typ_width(typ: u8) -> usize {
    match typ {
        0x0 => 0,
        0x1 => 1,
        0x2 => 2,
        0x3 => 4,
        0x5 => 4,
        0x7 => 1,
        // was a bare panic!(); name the offending code for easier debugging
        _ => panic!("invalid BCF2 type code: {typ}"),
    }
}
503
504#[derive(Debug, PartialEq)]
505/// Represents a numeric value in the context of the bcf-reader.
506pub enum NumericValue {
507 /// Represents an unsigned 8-bit integer value.
508 U8(u8),
509 /// Represents an unsigned 16-bit integer value.
510 U16(u16),
511 /// Represents an unsigned 32-bit integer value.
512 U32(u32),
513 /// Represents a 32-bit floating-point value. (Note that a u32 is used to
514 /// hold the bits for the f32 value)
515 F32(u32),
516}
517
518impl Default for NumericValue {
519 fn default() -> Self {
520 NumericValue::U8(0)
521 }
522}
523
524impl From<u8> for NumericValue {
525 fn from(value: u8) -> Self {
526 Self::U8(value)
527 }
528}
529impl From<u16> for NumericValue {
530 fn from(value: u16) -> Self {
531 Self::U16(value)
532 }
533}
534impl From<u32> for NumericValue {
535 fn from(value: u32) -> Self {
536 Self::U32(value)
537 }
538}
539
540impl NumericValue {
541 fn is_missing(&self) -> bool {
542 match *self {
543 NumericValue::U8(x) => x == 0x80,
544 NumericValue::U16(x) => x == 0x8000,
545 NumericValue::U32(x) => x == 0x80000000,
546 NumericValue::F32(x) => x == 0x7F800001,
547 }
548 }
549
550 fn is_end_of_vector(&self) -> bool {
551 match *self {
552 NumericValue::U8(x) => x == 0x81,
553 NumericValue::U16(x) => x == 0x8001,
554 NumericValue::U32(x) => x == 0x80000001,
555 NumericValue::F32(x) => x == 0x7F800002,
556 }
557 }
558
559 fn as_f32(&self) -> Result<Self> {
560 match *self {
561 NumericValue::U32(x) => Ok(NumericValue::F32(x)),
562 _ => Err(Error::NumericaValueAsF32Error),
563 }
564 }
565
566 /// Returns the integer value if the NumericValue is an unsigned integer and not missing.
567 ///
568 /// # Examples
569 ///
570 /// ```
571 /// use bcf_reader::NumericValue;
572 ///
573 /// let value = NumericValue::U8(42);
574 /// assert_eq!(value.int_val(), Some(42u32));
575 ///
576 /// let missing_value = NumericValue::U8(0x80u8);
577 /// assert_eq!(missing_value.int_val(), None);
578 /// ```
579 pub fn int_val(&self) -> Option<u32> {
580 if self.is_end_of_vector() || self.is_missing() {
581 None
582 } else {
583 match *self {
584 Self::U8(x) => Some(x as u32),
585 Self::U16(x) => Some(x as u32),
586 Self::U32(x) => Some(x),
587 _ => None,
588 }
589 }
590 }
591
592 /// Returns the floating-point value if the NumericValue is a 32-bit float and not missing.
593 ///
594 /// # Examples
595 ///
596 /// ```
597 /// use bcf_reader::NumericValue;
598 ///
599 /// let value = NumericValue::F32(3.14f32.to_bits());
600 /// assert_eq!(value.float_val(), Some(3.14f32));
601 ///
602 /// let missing_value = NumericValue::F32(0x7F800001);
603 /// let missing_value2 = NumericValue::F32(0x7F800001);
604 /// assert_eq!(missing_value.float_val(), missing_value2.float_val());
605 /// dbg!(&missing_value);
606 /// assert_eq!(missing_value.float_val(), None);
607 /// ```
608 pub fn float_val(&self) -> Option<f32> {
609 if self.is_end_of_vector() || self.is_missing() {
610 None
611 } else {
612 match *self {
613 Self::F32(x) => Some(f32::from_bits(x)),
614 _ => panic!(),
615 }
616 }
617 }
618
619 /// Returns a tuple representing the GT value.
620 ///
621 /// The tuple contains the following elements:
622 /// - `noploidy`: A boolean indicating if the ploidy is missing (for individuals with fewer ploids compared to individuals with the maximum ploidy).
623 /// - `dot`: A boolean indicating if the genotype call is a dot.
624 /// - `phased`: A boolean indicating if the allele is phased (the first allele of a call is always unphased).
625 /// - `allele`: The allele value (index).
626 ///
627 /// # Examples
628 ///
629 /// ```
630 /// use bcf_reader::NumericValue;
631 ///
632 /// let value = NumericValue::U8(3);
633 /// assert_eq!(value.gt_val(), (false, false, true, 0));
634 ///
635 /// let value = NumericValue::U8(5);
636 /// assert_eq!(value.gt_val(), (false, false, true, 1));
637 ///
638 /// let missing_value = NumericValue::U8(0);
639 /// assert_eq!(missing_value.gt_val(), (false, true, false, u32::MAX));
640 /// ```
641 pub fn gt_val(&self) -> (bool, bool, bool, u32) {
642 let mut noploidy = false;
643 let mut dot = false;
644 let mut phased = false;
645 let mut allele = u32::MAX;
646
647 match self.int_val() {
648 None => {
649 noploidy = true;
650 }
651 Some(int_val) => {
652 phased = (int_val & 0x1) != 0;
653
654 let int_val = int_val >> 1;
655 if int_val == 0 {
656 dot = true;
657 } else {
658 allele = int_val - 1;
659 }
660 }
661 };
662
663 (noploidy, dot, phased, allele)
664 }
665}
666
/// Read typed descriptor from the reader (of decompressed BCF buffer)
///
/// Return `typ` for type and `n` for count of elements of the type.
/// The descriptor byte packs the type code in its low 4 bits and the count
/// in the high 4 bits.
pub fn read_typed_descriptor_bytes<R>(reader: &mut R) -> std::io::Result<(u8, usize)>
where
    R: std::io::Read + ReadBytesExt,
{
    let tdb = reader.read_u8()?;
    let typ = tdb & 0xf;
    let mut n = (tdb >> 4) as usize;
    // 15 is the overflow marker: the real count follows as a typed integer
    if n == 15 {
        n = read_single_typed_integer(reader)? as usize;
    }
    Ok((typ, n))
}
682
683/// Read a single typed integer from the reader (of decompressed BCF buffer)
684pub fn read_single_typed_integer<R>(reader: &mut R) -> std::io::Result<u32>
685where
686 R: std::io::Read + ReadBytesExt,
687{
688 let (typ, n) = read_typed_descriptor_bytes(reader)?;
689 assert_eq!(n, 1);
690 Ok(match typ {
691 1 => reader.read_u8()? as u32,
692 2 => reader.read_u16::<LittleEndian>()? as u32,
693 3 => reader.read_u32::<LittleEndian>()?,
694 _ => panic!(),
695 })
696}
697
/// Iterator for accessing arrays of numeric values (integers or floats)
/// directly from the buffer bytes without building Vec<_> or Vec<Vec<_>>
/// for each site.
#[derive(Default, Debug)]
pub struct NumericValueIter<'r> {
    /// Cursor over the raw (decompressed) field bytes.
    reader: std::io::Cursor<&'r [u8]>,
    /// BCF2 type code of the elements (see [`bcf2_typ_width`]).
    typ: u8,
    /// Total number of elements to yield.
    len: usize,
    /// Number of elements yielded so far.
    cur: usize,
}
708
709impl<'r> Iterator for NumericValueIter<'r> {
710 type Item = std::io::Result<NumericValue>;
711 fn next(&mut self) -> Option<Self::Item> {
712 if self.cur >= self.len {
713 None
714 } else {
715 match self.typ {
716 0 => None,
717 1 => {
718 self.cur += 1;
719 Some(self.reader.read_u8().map(NumericValue::from))
720 }
721 2 => {
722 self.cur += 1;
723 Some(
724 self.reader
725 .read_u16::<LittleEndian>()
726 .map(NumericValue::from),
727 )
728 }
729 3 => {
730 self.cur += 1;
731 Some(
732 self.reader
733 .read_u32::<LittleEndian>()
734 .map(NumericValue::from),
735 )
736 }
737 5 => {
738 self.cur += 1;
739 let val_res = match self
740 .reader
741 .read_u32::<LittleEndian>()
742 .map(NumericValue::from)
743 {
744 Ok(nv) => match nv.as_f32() {
745 Ok(nv) => Ok(nv),
746 Err(_e) => Err(std::io::Error::new(
747 std::io::ErrorKind::Other,
748 Error::NumericaValueAsF32Error,
749 )),
750 },
751 Err(e) => Err(e),
752 };
753
754 Some(val_res)
755 }
756 _ => panic!(),
757 }
758 }
759 }
760}
761
/// Generate an iterator of numbers from a continuous bytes buffer
/// - typ: data type byte
/// - n: total number of elements to iterate
/// - buffer: the bytes buffer
///
/// Despite the name, the returned iterator also yields floats when `typ` is 5.
pub fn iter_typed_integers(typ: u8, n: usize, buffer: &[u8]) -> NumericValueIter {
    NumericValueIter {
        reader: std::io::Cursor::new(buffer),
        typ,
        len: n,
        cur: 0,
    }
}
774
775/// Read a typed string from the reader to a Rust String
776pub fn read_typed_string<R>(reader: &mut R, buffer: &mut Vec<u8>) -> std::io::Result<usize>
777where
778 R: std::io::Read + ReadBytesExt,
779{
780 let (typ, n) = read_typed_descriptor_bytes(reader)?;
781 assert_eq!(typ, 0x7);
782 let s = buffer.len();
783 buffer.resize(s + n, b'\0');
784 reader.read_exact(&mut buffer.as_mut_slice()[s..s + n])?;
785 Ok(n)
786}
787
788/// read the header lines to a String
789/// use Header::from_string(text) to convert the string into structured data
790pub fn read_header<R>(reader: &mut R) -> Result<String>
791where
792 R: std::io::Read + ReadBytesExt,
793{
794 // read magic
795 let mut magic = [0u8; 3];
796 reader.read_exact(&mut magic)?;
797 assert_eq!(&magic, b"BCF");
798
799 // read major verion and minor version
800 let major = reader.read_u8()?;
801 let minor = reader.read_u8()?;
802 assert_eq!(major, 2);
803 assert_eq!(minor, 2);
804
805 // read text length
806 let l_length = reader.read_u32::<LittleEndian>()?;
807 let mut text = vec![0u8; l_length as usize];
808 reader.read_exact(&mut text)?;
809
810 String::from_utf8(text).map_err(Error::FromUtf8Error)
811}
812
/// Represents a record (a line or a site) in BCF file
#[derive(Default, Debug)]
pub struct Record {
    /// Raw bytes of the record's shared block (CHROM..INFO).
    buf_shared: Vec<u8>,
    /// Raw bytes of the record's per-sample (FORMAT) block.
    buf_indiv: Vec<u8>,
    /// Contig index into the header's contig dictionary.
    chrom: i32,
    /// Position as stored in the record. NOTE(review): BCF stores 0-based
    /// positions on disk — confirm before displaying as 1-based.
    pos: i32,
    /// Reference length.
    rlen: i32,
    /// Site quality; raw f32 bits wrapped in a `NumericValue::F32`.
    qual: NumericValue,
    /// Number of INFO entries.
    n_info: u16,
    /// Number of alleles (REF + ALTs).
    n_allele: u16,
    /// Number of samples (24-bit field on disk).
    n_sample: u32,
    /// Number of FORMAT fields.
    n_fmt: u8,
    /// Byte range of the ID string within `buf_shared`.
    id: Range<usize>,
    /// Byte ranges of the allele strings within `buf_shared`.
    alleles: Vec<Range<usize>>,
    /// (typ, n, byte_range)
    filters: (u8, usize, Range<usize>),
    /// (info_key, typ, n, byte_range)
    info: Vec<(usize, u8, usize, Range<usize>)>,
    /// (fmt_key, typ, n, byte_range)
    gt: Vec<(usize, u8, usize, Range<usize>)>,
}
835impl Record {
836 /// read a record (copy bytes from the reader to the record's interval
837 /// buffers), and separate fields
838 pub fn read<R>(&mut self, reader: &mut R) -> Result<()>
839 where
840 R: std::io::Read + ReadBytesExt,
841 {
842 let l_shared = match reader.read_u32::<LittleEndian>() {
843 Ok(x) => x,
844 Err(_x) => Err(_x)?,
845 };
846 let l_indv = reader.read_u32::<LittleEndian>()?;
847 // dbg!(l_shared, l_indv);
848 self.buf_shared.resize(l_shared as usize, 0u8);
849 self.buf_indiv.resize(l_indv as usize, 0u8);
850 reader.read_exact(self.buf_shared.as_mut_slice())?;
851 reader.read_exact(self.buf_indiv.as_mut_slice())?;
852 self.parse_shared()?;
853 self.parse_indv()?;
854 // dbg!(self.pos);
855 Ok(())
856 }
    /// parse shared fields
    ///
    /// Decodes the fixed-width prefix (CHROM, POS, RLEN, QUAL and the counts)
    /// and then records byte ranges into `buf_shared` for ID, alleles, FILTER
    /// and INFO instead of materializing their values.
    fn parse_shared(&mut self) -> std::io::Result<()> {
        let mut reader = std::io::Cursor::new(self.buf_shared.as_slice());
        self.chrom = reader.read_i32::<LittleEndian>()?;
        self.pos = reader.read_i32::<LittleEndian>()?;
        self.rlen = reader.read_i32::<LittleEndian>()?;
        // QUAL is an f32 stored as raw bits; `as_f32` just tags the bits
        let qual_u32 = reader.read_u32::<LittleEndian>()?;
        self.qual = NumericValue::from(qual_u32)
            .as_f32()
            .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
        self.n_info = reader.read_u16::<LittleEndian>()?;
        self.n_allele = reader.read_u16::<LittleEndian>()?;
        // n_sample (low 24 bits) and n_fmt (high 8 bits) share one u32
        let combined = reader.read_u32::<LittleEndian>()?;
        self.n_sample = combined & 0xffffff;
        self.n_fmt = (combined >> 24) as u8;
        // id
        let (typ, n) = read_typed_descriptor_bytes(&mut reader)?;
        assert_eq!(typ, 0x7);
        let cur = reader.position() as usize;
        self.id = cur..cur + n;
        reader.seek(std::io::SeekFrom::Current(n as i64))?;
        // alleles
        self.alleles.clear();
        for _ in 0..self.n_allele {
            let (typ, n) = read_typed_descriptor_bytes(&mut reader)?;
            assert_eq!(typ, 0x7);
            let cur = reader.position() as usize;
            self.alleles.push(cur..cur + n);
            reader.seek(std::io::SeekFrom::Current(n as i64))?;
        }
        // filters
        let (typ, n) = read_typed_descriptor_bytes(&mut reader)?;
        let width: usize = bcf2_typ_width(typ);
        let s = reader.position() as usize;
        let e = s + width * n;
        reader.seek(std::io::SeekFrom::Current((e - s) as i64))?;
        self.filters = (typ, n, s..e);
        // infos
        self.info.clear();
        for _idx in 0..(self.n_info as usize) {
            let info_key = read_single_typed_integer(&mut reader)?;
            let (typ, n) = read_typed_descriptor_bytes(&mut reader)?;
            let width = bcf2_typ_width(typ);
            let s = reader.position() as usize;
            let e = s + width * n;
            reader.seek(std::io::SeekFrom::Current((e - s) as i64))?;
            self.info.push((info_key as usize, typ, n, s..e));
        }
        Ok(())
    }
    /// parse indiv fields, complicated field will need further processing
    ///
    /// Records (fmt_key, typ, n, byte_range) for each FORMAT field; each byte
    /// range covers all samples (width * n_sample * n bytes of `buf_indiv`).
    fn parse_indv(&mut self) -> std::io::Result<()> {
        let mut reader = std::io::Cursor::new(self.buf_indiv.as_slice());
        self.gt.clear();
        for _idx in 0..(self.n_fmt as usize) {
            let fmt_key = read_single_typed_integer(&mut reader)?;
            let (typ, n) = read_typed_descriptor_bytes(&mut reader)?;
            let width = bcf2_typ_width(typ);
            let s = reader.position() as usize;
            let e = s + width * self.n_sample as usize * n;
            reader.seek(std::io::SeekFrom::Current((e - s) as i64))?;
            self.gt.push((fmt_key as usize, typ, n, s..e));
        }
        Ok(())
    }
922
    /// get chromosome offset
    ///
    /// The value is an index into the header's contig dictionary; resolve it
    /// to a name with [`Header::get_chrname`].
    /// Example:
    /// ```
    /// use bcf_reader::*;
    /// use std::io::Write;
    /// // read data generated by bcftools
    /// // bcftools query -f '%CHROM\n' test.bcf | bgzip -c > test_chrom.gz
    /// let mut chrom_str = String::new();
    /// smart_reader("testdata/test_chrom.gz")
    ///     .unwrap()
    ///     .read_to_string(&mut chrom_str)
    ///     .unwrap();
    /// // read data via bcf-reader
    /// let mut f = smart_reader("testdata/test.bcf").unwrap();
    /// let s = read_header(&mut f).unwrap();
    /// let header = Header::from_string(&s).unwrap();
    /// let mut record = Record::default();
    /// let mut chrom_str2 = Vec::<u8>::new();
    /// while let Ok(_) = record.read(&mut f) {
    ///     write!(
    ///         chrom_str2,
    ///         "{}\n",
    ///         header.get_chrname(record.chrom() as usize)
    ///     )
    ///     .unwrap();
    /// }
    /// let chrom_str2 = String::from_utf8(chrom_str2).unwrap();
    /// // compare bcftools results and bcf-reader results
    /// assert_eq!(chrom_str, chrom_str2);
    /// ```
    pub fn chrom(&self) -> i32 {
        self.chrom
    }
956
    /// Returns the reference length of the record.
    pub fn rlen(&self) -> i32 {
        self.rlen
    }

    /// Returns the quality score of the record, if available
    /// (`None` when QUAL holds the missing value).
    pub fn qual(&self) -> Option<f32> {
        self.qual.float_val()
    }

    /// Returns the number of alleles (REF + ALTs) at this site.
    pub fn n_allele(&self) -> u16 {
        self.n_allele
    }
970
971 /// Returns an iterator over the genotype values in the record's FORMAT field.
972 /// If no FORMAT/GT field available, the returned iterator will have items.
973 /// Example:
974 /// ```
975 /// use bcf_reader::*;
976 /// use std::io::Write;
977 /// // read data generated by bcftools
978 /// // bcftools query -f '[\t%GT]\n' test.bcf | bgzip -c > test_gt.gz
979 /// let mut gt_str = String::new();
980 /// smart_reader("testdata/test_gt.gz")
981 /// .unwrap()
982 /// .read_to_string(&mut gt_str)
983 /// .unwrap();
984 /// // read data via bcf-reader
985 /// let mut f = smart_reader("testdata/test.bcf").unwrap();
986 /// let s = read_header(&mut f).unwrap();
987 /// let header = Header::from_string(&s).unwrap();
988 /// let mut record = Record::default();
989 /// let mut gt_str2 = Vec::<u8>::new();
990 /// while let Ok(_) = record.read(&mut f) {
991 /// for (i, bn) in record.fmt_gt(&header).enumerate() {
992 /// let (noploidy, dot, phased, allele) = bn.unwrap().gt_val();
993 /// assert_eq!(noploidy, false); // missing ploidy
994 /// let mut sep = '\t';
995 /// if i % 2 == 1 {
996 /// if phased {
997 /// sep = '|';
998 /// } else {
999 /// sep = '/';
1000 /// }
1001 /// }
1002 /// if dot {
1003 /// write!(gt_str2, "{sep}.").unwrap();
1004 /// } else {
1005 /// write!(gt_str2, "{sep}{allele}").unwrap();
1006 /// }
1007 /// }
1008 /// write!(gt_str2, "\n").unwrap();
1009 /// }
1010 /// let gt_str2 = String::from_utf8(gt_str2).unwrap();
1011 /// // compare bcftools results and bcf-reader results
1012 /// for (a, b) in gt_str
1013 /// .split(|c| (c == '\n') || (c == '\t'))
1014 /// .zip(gt_str2.split(|c| (c == '\n') || (c == '\t')))
1015 /// {
1016 /// assert_eq!(a, b);
1017 /// }
1018 /// ```
1019 pub fn fmt_gt(&self, header: &Header) -> NumericValueIter<'_> {
1020 let mut it = NumericValueIter::default();
1021 match header.get_fmt_gt_id() {
1022 None => it,
1023 Some(fmt_gt_id) => {
1024 // find the right field for gt
1025 self.gt.iter().for_each(|e| {
1026 if e.0 == fmt_gt_id {
1027 it = iter_typed_integers(
1028 e.1,
1029 e.2 * self.n_sample as usize,
1030 &self.buf_indiv[e.3.start..e.3.end],
1031 );
1032 }
1033 });
1034 it
1035 }
1036 }
1037 }
1038
1039 /// Returns an iterator over all values for a field in the record's FORMATs (indiv).
1040 ///
1041 /// Example:
1042 /// ```
1043 /// use bcf_reader::*;
1044 /// use std::io::Write;
1045 /// // read data generated by bcftools
1046 /// // bcftools query -f '[\t%AD]\n' test.bcf | bgzip -c > test_ad.gz
1047 /// let mut ad_str = String::new();
1048 /// smart_reader("testdata/test_ad.gz")
1049 /// .unwrap()
1050 /// .read_to_string(&mut ad_str)
1051 /// .unwrap();
1052 /// // read data via bcf-reader
1053 /// let mut f = smart_reader("testdata/test.bcf").unwrap();
1054 /// let s = read_header(&mut f).unwrap();
1055 /// let header = Header::from_string(&s).unwrap();
1056 /// let mut record = Record::default();
1057 /// let mut ad_str2 = Vec::<u8>::new();
    /// let ad_field_key = header.get_idx_from_dictionary_str("FORMAT", "AD").unwrap();
    /// while let Ok(_) = record.read(&mut f) {
    ///     for (i, val) in record.fmt_field(ad_field_key).enumerate() {
1061 /// let val = val.unwrap();
1062 /// if i % record.n_allele() as usize == 0 {
1063 /// if ad_str2.last().map(|c| *c == b',') == Some(true) {
1064 /// ad_str2.pop(); // trim last allele separator
1065 /// }
1066 /// ad_str2.push(b'\t'); // sample separator
1067 /// }
1068 /// match val.int_val() {
1069 /// None => {}
1070 /// Some(ad) => {
1071 /// write!(ad_str2, "{ad},").unwrap(); // allele separator
1072 /// }
1073 /// }
1074 /// }
1075 /// // site separator
1076 /// *ad_str2.last_mut().unwrap() = b'\n'; // sample separator
1077 /// }
1078 /// let ad_str2 = String::from_utf8(ad_str2).unwrap();
1079 /// // compare bcftools results and bcf-reader results
1080 /// for (a, b) in ad_str
1081 /// .split(|c| (c == '\n') || (c == '\t'))
1082 /// .zip(ad_str2.split(|c| (c == '\n') || (c == '\t')))
1083 /// {
1084 /// assert_eq!(a, b);
1085 /// }
1086 /// ```
1087 pub fn fmt_field(&self, fmt_key: usize) -> NumericValueIter<'_> {
1088 // default iterator
1089 let mut it = NumericValueIter::default();
1090
1091 // find the right field for gt
1092 self.gt.iter().for_each(|e| {
1093 if e.0 == fmt_key {
1094 it = iter_typed_integers(
1095 e.1,
1096 e.2 * self.n_sample as usize,
1097 &self.buf_indiv[e.3.start..e.3.end],
1098 );
1099 }
1100 });
1101 it
1102 }
1103
1104 /// get 0-based position (bp) value
1105 /// Example:
1106 /// ```
1107 /// use bcf_reader::*;
1108 /// // read data generated by bcftools
1109 /// // bcftools query -f '%POS\n' test.bcf | bgzip -c > test_pos.gz
1110 /// let mut pos_str = String::new();
1111 /// smart_reader("testdata/test_pos.gz")
1112 /// .unwrap()
1113 /// .read_to_string(&mut pos_str)
1114 /// .unwrap();
1115 /// // read data via bcf-reader
1116 /// let mut f = smart_reader("testdata/test.bcf").unwrap();
1117 /// let _s = read_header(&mut f).unwrap();
1118 /// let mut record = Record::default();
1119 /// let mut pos_str2 = Vec::<u8>::new();
1120 /// use std::io::Write;
1121 /// while let Ok(_) = record.read(&mut f) {
1122 /// write!(pos_str2, "{}\n", record.pos() + 1).unwrap();
1123 /// }
1124 /// let pos_str2 = String::from_utf8(pos_str2).unwrap();
1125 /// // compare bcftools results and bcf-reader results
1126 /// assert_eq!(pos_str, pos_str2);
1127 /// ```
    pub fn pos(&self) -> i32 {
        // Stored 0-based, as in the BCF binary encoding; add 1 for VCF-style output.
        self.pos
    }
1131
1132 /// get variant ID as &str
1133 /// Example:
1134 /// ```
1135 /// use bcf_reader::*;
1136 /// // read data generated by bcftools
1137 /// // bcftools query -f '%ID\n' test.bcf | bgzip -c > test_id.gz
1138 /// let mut id_str = String::new();
1139 ///
1140 /// smart_reader("testdata/test_id.gz")
1141 /// .unwrap()
1142 /// .read_to_string(&mut id_str)
1143 /// .unwrap();
1144 ///
1145 /// // read data via bcf-reader
1146 /// let mut f = smart_reader("testdata/test.bcf").unwrap();
1147 /// let _s = read_header(&mut f).unwrap();
1148 /// let mut record = Record::default();
1149 /// let mut id_str2 = Vec::<u8>::new();
1150 /// use std::io::Write;
1151 /// while let Ok(_) = record.read(&mut f) {
1152 /// let record_id = record.id().unwrap();
1153 /// let id = if record_id.is_empty() { "." } else { record_id };
1154 /// write!(id_str2, "{}\n", id).unwrap();
1155 /// }
1156 /// let id_str2 = String::from_utf8(id_str2).unwrap();
1157 /// // compare bcftools results and bcf-reader results
1158 /// assert_eq!(id_str, id_str2);
1159 /// ```
1160 pub fn id(&self) -> Result<&str> {
1161 std::str::from_utf8(&self.buf_shared[self.id.start..self.id.end]).map_err(Error::Utf8Error)
1162 }
1163
1164 /// Returns the ranges of bytes in buf_shared for all alleles in the record.
1165 /// Example:
1166 /// ```
1167 /// use bcf_reader::*;
1168 /// // read data generated by bcftools
1169 /// // bcftools query -f '%REF,%ALT\n' test.bcf | bgzip -c > test_allele.gz
1170 /// let mut allele_str = String::new();
1171 /// smart_reader("testdata/test_allele.gz")
1172 /// .unwrap()
1173 /// .read_to_string(&mut allele_str)
1174 /// .unwrap();
1175 /// // read data via bcf-reader
1176 /// let mut f = smart_reader("testdata/test.bcf").unwrap();
1177 /// let _s = read_header(&mut f).unwrap();
1178 /// let mut record = Record::default();
1179 /// let mut allele_str2 = Vec::<u8>::new();
1180 /// while let Ok(_) = record.read(&mut f) {
1181 /// for rng in record.alleles().iter() {
1182 /// let slice = &record.buf_shared()[rng.clone()];
1183 /// allele_str2.extend(slice);
1184 /// allele_str2.push(b',');
1185 /// }
1186 /// *allele_str2.last_mut().unwrap() = b'\n';
1187 /// }
1188 /// let allele_str2 = String::from_utf8(allele_str2).unwrap();
1189 /// // compare bcftools results and bcf-reader results
1190 /// assert_eq!(allele_str, allele_str2);
1191 /// ```
1192 pub fn alleles(&self) -> &[Range<usize>] {
1193 &self.alleles[..]
1194 }
1195
1196 /// Return an iterator of numeric values for an INFO/xxx field.
1197 /// If the key is not found, the returned iterator will have a zero length.
1198 ///
1199 /// Example:
1200 /// ```
1201 /// use bcf_reader::*;
1202 /// use std::io::Write;
1203 /// // read data generated by bcftools
1204 /// // bcftools query -f '%INFO/AF\n' testdata/test2.bcf | bgzip -c > testdata/test2_info_af.gz
1205 /// let mut info_af_str = String::new();
1206 /// smart_reader("testdata/test2_info_af.gz")
1207 /// .unwrap()
1208 /// .read_to_string(&mut info_af_str)
1209 /// .unwrap();
1210 /// // read data via bcf-reader
1211 /// let mut f = smart_reader("testdata/test2.bcf").unwrap();
1212 /// let s = read_header(&mut f).unwrap();
1213 /// let header = Header::from_string(&s).unwrap();
1214 /// let mut record = Record::default();
1215 /// let mut info_af_str2 = Vec::<u8>::new();
1216 /// let info_af_key = header.get_idx_from_dictionary_str("INFO", "AF").unwrap();
1217 /// while let Ok(_) = record.read(&mut f) {
1218 /// record.info_field_numeric(info_af_key).for_each(|nv| {
1219 /// let af = nv.unwrap().float_val().unwrap();
1220 /// write!(info_af_str2, "{af},").unwrap();
1221 /// });
1222 /// *info_af_str2.last_mut().unwrap() = b'\n'; // line separators
1223 /// }
1224 /// let filter_str2 = String::from_utf8(info_af_str2).unwrap();
1225 /// assert_eq!(info_af_str, filter_str2);
1226 /// ```
1227 pub fn info_field_numeric(&self, info_key: usize) -> NumericValueIter {
1228 // default
1229 let mut it = NumericValueIter {
1230 reader: std::io::Cursor::new(&[0u8; 0]),
1231 typ: 0,
1232 len: 0,
1233 cur: 0,
1234 };
1235 for (key, typ, n, rng) in self.info.iter() {
1236 if *key == info_key {
1237 it = NumericValueIter {
1238 reader: std::io::Cursor::new(&self.buf_shared[rng.start..rng.end]),
1239 typ: *typ,
1240 len: *n,
1241 cur: 0,
1242 };
1243 break;
1244 }
1245 }
1246 it
1247 }
1248
1249 /// Return str value for an INFO/xxx field.
1250 /// If the key is not found or data type is not string, then return None.
1251 pub fn info_field_str(&self, info_key: usize) -> Result<Option<&str>> {
1252 let mut res = None;
1253 for (key, typ, _n, rng) in self.info.iter() {
1254 if *key == info_key {
1255 if *typ != 0x7 {
1256 return Ok(None);
1257 }
1258 let s = std::str::from_utf8(&self.buf_shared[rng.start..rng.end])
1259 .map_err(Error::Utf8Error)?;
1260 res = Some(s);
1261 break;
1262 }
1263 }
1264 Ok(res)
1265 }
1266
1267 /// iterate an integer for each filter key.
1268 /// If the length of the iterator is 0, it means no filter label is set.
1269 /// Example:
1270 /// ```
1271 /// use bcf_reader::*;
1272 /// use std::io::Write;
1273 /// // read data generated by bcftools
1274 /// // bcftools query -f '%FILTER\n' testdata/test2.bcf | bgzip -c > testdata/test2_filters.gz
1275 /// let mut filter_str = String::new();
1276 /// smart_reader("testdata/test2_filters.gz")
1277 /// .unwrap()
1278 /// .read_to_string(&mut filter_str)
1279 /// .unwrap();
1280 /// // read data via bcf-reader
1281 /// let mut f = smart_reader("testdata/test2.bcf").unwrap();
1282 /// let s = read_header(&mut f).unwrap();
1283 /// let header = Header::from_string(&s).unwrap();
1284 /// let mut record = Record::default();
1285 /// let mut filter_str2 = Vec::<u8>::new();
1286 /// let d = header.dict_strings();
1287 /// while let Ok(_) = record.read(&mut f) {
1288 /// record.filters().for_each(|nv| {
1289 /// let nv = nv.unwrap();
1290 /// let filter_key = nv.int_val().unwrap() as usize;
1291 /// let dict_string_map = &d[&filter_key];
1292 /// let filter_name = &dict_string_map["ID"];
1293 /// write!(filter_str2, "{filter_name};").unwrap();
1294 /// });
1295 /// *filter_str2.last_mut().unwrap() = b'\n'; // line separators
1296 /// }
1297 /// let filter_str2 = String::from_utf8(filter_str2).unwrap();
1298 /// // compare bcftools results and bcf-reader results
1299 /// assert_eq!(filter_str, filter_str2);
1300 /// ```
1301 pub fn filters(&self) -> NumericValueIter {
1302 let (typ, n, rng) = &self.filters;
1303 NumericValueIter {
1304 reader: std::io::Cursor::new(&self.buf_shared[rng.start..rng.end]),
1305 typ: *typ,
1306 len: *n,
1307 cur: 0,
1308 }
1309 }
1310
1311 /// Returns the buffer containing indv (sample-level) information
1312 pub fn buf_indiv(&self) -> &[u8] {
1313 &self.buf_indiv[..]
1314 }
1315
1316 /// Returns the buffer containing the shared (site-level) information
1317 pub fn buf_shared(&self) -> &[u8] {
1318 &self.buf_shared[..]
1319 }
1320}
1321
1322/// Open a file from a path as a MultiGzDecoder or a BufReader depending on
1323/// whether the file has the magic number for gzip (0x1f and 0x8b)
1324pub fn smart_reader(p: impl AsRef<std::path::Path>) -> std::io::Result<Box<dyn std::io::Read>> {
1325 let mut f = std::fs::File::open(p.as_ref())?;
1326 if (f.read_u8()? == 0x1fu8) && (f.read_u8()? == 0x8bu8)
1327 //.expect("can not read second byte")
1328 {
1329 // gzip format
1330 f.rewind()?;
1331 Ok(Box::new(flate2::read::MultiGzDecoder::new(f)))
1332 } else {
1333 // not gzip format
1334 f.rewind()?;
1335 Ok(Box::new(std::io::BufReader::new(f)))
1336 }
1337}
1338
1339/// This reader facilitates parallel decompression of BCF data compressed in
1340/// the BGZF format—a specialized version of the multi-member gzip file format.
1341/// It utilizes internal buffers to sequentially ingest compressed data from
1342/// various gzip blocks, leveraging the `rayon` crate to achieve concurrent
1343/// decompression. This design addresses the potential bottleneck in data
1344/// processing speed that occurs when decompression is not executed in parallel,
1345/// ensuring more efficient handling of compressed data streams.
1346/// Example:
1347/// ```
1348/// use bcf_reader::*;
1349/// use std::fs::File;
1350/// use std::io::BufReader;
1351/// use std::io::Write;
1352/// // read data generated by bcftools
1353/// // bcftools query -f '[\t%GT]\n' test.bcf | bgzip -c > test_gt.gz
1354/// let mut gt_str = String::new();
1355/// smart_reader("testdata/test_gt.gz")
1356/// .unwrap()
1357/// .read_to_string(&mut gt_str)
1358/// .unwrap();
1359/// // read data via bcf-reader
1360/// let max_gzip_block_in_buffer = 10;
1361/// let reader = File::open("testdata/test.bcf").map(BufReader::new).unwrap();
1362/// let mut f =
1363/// ParMultiGzipReader::from_reader(reader, max_gzip_block_in_buffer, None, None).unwrap();
1364/// let s = read_header(&mut f).unwrap();
1365/// let header = Header::from_string(&s).unwrap();
1366/// let mut record = Record::default();
1367/// let mut gt_str2 = Vec::<u8>::new();
1368/// while let Ok(_) = record.read(&mut f) {
1369/// for (i, bn) in record.fmt_gt(&header).enumerate() {
1370/// let bn = bn.unwrap();
1371/// let (noploidy, dot, phased, allele) = bn.gt_val();
1372/// assert_eq!(noploidy, false); // missing ploidy
1373/// let mut sep = '\t';
1374/// if i % 2 == 1 {
1375/// if phased {
1376/// sep = '|';
1377/// } else {
1378/// sep = '/';
1379/// }
1380/// }
1381/// if dot {
1382/// write!(gt_str2, "{sep}.").unwrap();
1383/// } else {
1384/// write!(gt_str2, "{sep}{allele}").unwrap();
1385/// }
1386/// }
1387/// write!(gt_str2, "\n").unwrap();
1388/// }
1389/// let gt_str2 = String::from_utf8(gt_str2).unwrap();
1390/// // compare bcftools results and bcf-reader results
1391/// for (a, b) in gt_str
1392/// .split(|c| (c == '\n') || (c == '\t'))
1393/// .zip(gt_str2.split(|c| (c == '\n') || (c == '\t')))
1394/// {
1395/// assert_eq!(a, b);
1396/// }
1397/// ```
1398///
1399/// See [`ParMultiGzipReader::from_reader`] for an example to jump to a target
1400/// genome interval.
pub struct ParMultiGzipReader<R>
where
    R: Read,
{
    /// Underlying source of BGZF-compressed bytes.
    inner: R,
    // vector of pairs of vector; For each pair one is for compressed data, the other for uncompressed
    buffer: Vec<BgzfBuffer>,
    ngzip: usize, // number gzip blocks read into buffer
    igzip: usize, // the current block to be consumed
    ibyte: usize, // the current byte to be consumed
    /// Compressed-stream offset of the next gzip block to read from `inner`.
    coffset: u64,
    /// Set once `inner` reports EOF; no further blocks will be read.
    inner_eof: bool,
}
1414
/// Buffers for a single BGZF (gzip) block: its compressed payload, the
/// corresponding decompressed bytes, and bookkeeping offsets/sizes.
#[derive(Default, Clone)]
struct BgzfBuffer {
    compressed: Vec<u8>,   // raw DEFLATE payload of the block
    uncompressed: Vec<u8>, // decompressed data, sized to uncompressed_data_size
    coffset: u64,          // inclusive offset of this block in the compressed stream
    gzip_size: u16,        // total size of the gzip block (BSIZE + 1)
    uncompressed_data_size: u32, // ISIZE field of the block
}
1423
1424impl<R> ParMultiGzipReader<R>
1425where
1426 R: Read,
1427{
1428 /// Constructs a new `ParMultiGzipReader` by specifying the `ngzip_max` parameter,
1429 /// which defines the maximum number of gzip blocks that the internal buffers can
1430 /// handle simultaneously. This parameter should ideally be set to the number of
1431 /// CPU cores available to optimize parallel decompression performance, thereby
1432 /// leveraging the hardware's concurrency capabilities.
1433 ///
1434 /// The `coffset` parameter indicates the offset to the first byte of a
1435 /// target gzip block of the input reader. The input reader should point to
1436 /// the start byte of a gzip block as indicated by `coffset` before passing
1437 /// the reader to `ParMultiGzipReader::from_reader`; otherwise,
1438 /// [`Seek::seek`] should be used on the input reader to adjust the position
1439 /// accordingly. Note that `ParMultiGzipReader` does not call [`Seek::seek`]
1440 /// on the reader.
1441 ///
1442 /// The `uoffset` parameter specifies the number of bytes to skip within the
1443 /// first decompressed gzip data. Since skipping within uncompressed data
1444 /// requires decompression, this offset is applied within the
1445 /// `ParMultiGzipReader::from_reader` method.
1446 ///
1447 /// # Examples
1448 /// ```
1449 /// use bcf_reader::*;
1450 /// use std::{
1451 /// fs::File,
1452 /// io::{BufReader, Seek},
1453 /// };
1454 /// // index file
1455 /// let csi = Csi::from_path("testdata/test3.bcf.csi").unwrap();
1456 /// // reader
1457 /// let mut reader = File::open("testdata/test3.bcf")
1458 /// .map(BufReader::new)
1459 /// .unwrap();
1460 ///
    /// // calculate first offsets of the target position
1462 /// let start = 1495403 - 1;
1463 /// let end = 1495746 - 1;
1464 /// let chrom_id = 0;
1465 /// let bin_id = csi.get_bin_id(start, start + 1 as i64);
1466 /// let bin_details = csi.get_bin_details(chrom_id, bin_id).unwrap();
1467 /// let (coffset, uoffset) = bin_details.chunks()[0].chunk_beg.get_coffset_uoffset();
1468 ///
1469 /// // seek to the target bgzip block
1470 /// reader.seek(std::io::SeekFrom::Start(coffset)).unwrap();
    /// // create the parallelizable reader by wrapping around the existing reader
    /// // and specifying offsets
1473 /// let mut reader =
1474 /// ParMultiGzipReader::from_reader(reader, 1, Some(coffset), Some(uoffset)).unwrap();
1475 ///
1476 /// let mut record = Record::default();
1477 /// let mut pos_found = vec![];
1478 /// while let Ok(_) = record.read(&mut reader) {
1479 /// let pos = record.pos() as i64;
1480 /// // the bin containing the start position of target interval may have records
1481 /// // before the start position, so skip them.
1482 /// if pos < start {
1483 /// continue;
1484 /// }
1485 /// // read the record until out of the target interval
1486 /// else if pos >= end {
1487 /// break;
1488 /// }
1489 /// pos_found.push(pos);
1490 /// }
1491 /// assert_eq!(pos_found, vec![start]);
1492 /// ```
1493 ///
1494 /// # Parameters
1495 /// - `reader`: The input reader from which gzip blocks will be read.
1496 /// - `ngzip_max`: The maximum number of gzip blocks that can be processed in parallel.
1497 /// - `coffset`: An optional offset to the start of a gzip block in the input reader.
1498 /// - `uoffset`: An optional offset within the first block of decompressed data.
1499 ///
1500 /// # Returns
1501 /// Returns a new instance of `ParMultiGzipReader`.
    pub fn from_reader(
        reader: R,
        ngzip_max: usize,
        coffset: Option<u64>,
        uoffset: Option<u64>,
    ) -> Result<Self> {
        let mut this = Self {
            inner: reader,
            buffer: vec![BgzfBuffer::default(); ngzip_max],
            ngzip: 0,
            igzip: 0,
            ibyte: 0,
            // `coffset` must match the reader's current position (see doc above).
            coffset: coffset.unwrap_or(0),
            inner_eof: false,
        };
        // Eagerly read and decompress the first batch of gzip blocks so that
        // `uoffset` can be applied within already-decompressed data.
        this.clear_and_fill_buffers()?;
        this.decomp_all()?;
        // Skip `uoffset` bytes inside the first decompressed block.
        this.ibyte = uoffset.unwrap_or(0) as usize;
        Ok(this)
    }
    /// Returns the current virtual position as a `(coffset, uoffset)` pair:
    /// the compressed-stream offset of the gzip block being consumed and the
    /// byte offset within its decompressed data.
    pub fn get_coffset_uoffset(&self) -> (u64, u64) {
        let coffset = self.buffer[self.igzip].coffset;
        let uoffset = self.ibyte as u64;
        (coffset, uoffset)
    }
1527 fn read_single_gzip(&mut self) -> io::Result<()> {
1528 let this_buffer_offset = if self.ngzip == 0 {
1529 self.coffset
1530 } else {
1531 let prev_buffer = &self.buffer[self.ngzip - 1];
1532 prev_buffer.coffset + prev_buffer.gzip_size as u64
1533 };
1534 let this_buffer = &mut self.buffer[self.ngzip];
1535
1536 // dbg!(this_buffer_offset);
1537
1538 // let coffset_beg = self.inner.stream_position().unwrap();
1539 // dbg!(coffset_beg);
1540 let id1 = match self.inner.read_u8() {
1541 Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => {
1542 self.inner_eof = true;
1543 return Ok(());
1544 }
1545 Err(e) => {
1546 return Err(e);
1547 }
1548 Ok(id1) => id1,
1549 };
1550 if id1 != 31 {
1551 Err(ParseGzipHeaderError {
1552 key: "id1",
1553 expected: 31,
1554 found: id1 as usize,
1555 })?;
1556 }
1557
1558 let id2 = self.inner.read_u8()?;
1559 if id2 != 139 {
1560 Err(ParseGzipHeaderError {
1561 key: "id2",
1562 expected: 139,
1563 found: id2 as usize,
1564 })?;
1565 }
1566 // assert_eq!(id2, 139);
1567 let cm = self.inner.read_u8()?;
1568 if cm != 8 {
1569 Err(ParseGzipHeaderError {
1570 key: "cm",
1571 expected: 8,
1572 found: cm as usize,
1573 })?;
1574 }
1575 // assert_eq!(cm, 8);
1576 let flg = self.inner.read_u8()?;
1577 if flg != 4 {
1578 Err(ParseGzipHeaderError {
1579 key: "flg",
1580 expected: 4,
1581 found: flg as usize,
1582 })?;
1583 }
1584 // assert_eq!(flg, 4);
1585 let _mtime = self.inner.read_u32::<LittleEndian>()?;
1586 let _xfl = self.inner.read_u8()?;
1587 let _os = self.inner.read_u8()?;
1588 let xlen = self.inner.read_u16::<LittleEndian>()?;
1589 let si1 = self.inner.read_u8()?;
1590 assert_eq!(si1, 66);
1591 let si2 = self.inner.read_u8()?;
1592 assert_eq!(si2, 67);
1593 let slen = self.inner.read_u16::<LittleEndian>()?;
1594 assert_eq!(slen, 2);
1595 let bsize = self.inner.read_u16::<LittleEndian>()?;
1596
1597 let buffer_compressed = &mut this_buffer.compressed;
1598 let cdata_sz = bsize - xlen - 19;
1599
1600 buffer_compressed.resize(cdata_sz as usize, 0u8);
1601 self.inner.read_exact(buffer_compressed.as_mut_slice())?;
1602
1603 let _crc32 = self.inner.read_u32::<LittleEndian>()?;
1604 let isize = self.inner.read_u32::<LittleEndian>()?;
1605
1606 let buffer_uncompressed = &mut this_buffer.uncompressed;
1607 buffer_uncompressed.resize(isize as usize, 0u8);
1608 this_buffer.coffset = this_buffer_offset;
1609 this_buffer.gzip_size = bsize + 1;
1610 this_buffer.uncompressed_data_size = isize;
1611
1612 // increment counter
1613 self.ngzip += 1;
1614
1615 Ok(())
1616 }
    /// Clears all block buffers and sequentially refills them from `inner`,
    /// reading up to `buffer.len()` gzip blocks (fewer at EOF). Resets the
    /// consumption cursors (`igzip`, `ibyte`) to the start of the new batch.
    fn clear_and_fill_buffers(&mut self) -> std::io::Result<()> {
        // Destructure to update several fields without borrow conflicts.
        let Self {
            inner: _,
            buffer,
            coffset,
            ngzip,
            igzip,
            ibyte,
            inner_eof: _,
        } = self;

        // update coffset for the buffer vector based on last used buffer
        if *ngzip > 0 {
            let last_buffer = &buffer[*ngzip - 1];
            *coffset = last_buffer.coffset + last_buffer.gzip_size as u64;
        }

        // Reset every per-block buffer before reuse.
        buffer.iter_mut().for_each(|bgzf_buffer| {
            bgzf_buffer.compressed.clear();
            bgzf_buffer.uncompressed.clear();
            bgzf_buffer.coffset = 0;
            bgzf_buffer.gzip_size = 0;
            bgzf_buffer.uncompressed_data_size = 0;
        });
        *ngzip = 0;
        *igzip = 0;
        *ibyte = 0;
        // Fill sequentially; stop early when the source runs out of blocks.
        for _i in 0..self.buffer.len() {
            self.read_single_gzip()?;
            if self.inner_eof {
                break;
            }
        }
        Ok(())
    }
1653
1654 /// decompress all read gzip file in memory (parallel)
1655 fn decomp_all(&mut self) -> std::io::Result<()> {
1656 self.buffer
1657 .par_iter_mut()
1658 .try_for_each(|buffer| -> std::io::Result<()> {
1659 let compressed = buffer.compressed.as_slice();
1660 let uncompressed = &mut buffer.uncompressed.as_mut_slice();
1661 let mut deflater = DeflateDecoder::new(compressed);
1662 deflater.read_exact(uncompressed)?;
1663 //.map_err(|e| e.add_context("deflater.read_exact"))?;
1664 Ok(())
1665 })?;
1666 Ok(())
1667 }
1668}
1669
1670impl<R> Read for ParMultiGzipReader<R>
1671where
1672 R: Read,
1673{
1674 fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
1675 // no more data in the buffer
1676 if self.ngzip == self.igzip {
1677 // no more data to read from file
1678 if self.inner_eof {
1679 return Ok(0);
1680 }
1681 self.clear_and_fill_buffers()?;
1682 self.decomp_all()?;
1683 }
1684 // read from the buffer
1685 // check current
1686 let uncompressed = self.buffer[self.igzip].uncompressed.as_slice();
1687 let mut n = uncompressed.len() - self.ibyte;
1688 if n > buf.len() {
1689 n = buf.len();
1690 }
1691 // copy data
1692 buf[0..n]
1693 .iter_mut()
1694 .zip(uncompressed[self.ibyte..].iter())
1695 .for_each(|(d, s)| *d = *s);
1696 self.ibyte += n;
1697 if self.ibyte == uncompressed.len() {
1698 self.igzip += 1;
1699 self.ibyte = 0;
1700 if self.ngzip == self.igzip {
1701 // no more data to read from file
1702 if self.inner_eof {
1703 return Ok(0);
1704 }
1705 self.clear_and_fill_buffers()?;
1706 self.decomp_all()?;
1707 }
1708 }
1709 Ok(n)
1710 }
1711}
1712
/// Virtual file offset used to jump to a specific indexed bin within BCF-format
/// genotype data separated into BGZF blocks.
///
/// Packs `coffset` (upper 48 bits) and `uoffset` (lower 16 bits) into a `u64`.
#[derive(Default)]
pub struct VirtualFileOffsets(u64);
1717
impl VirtualFileOffsets {
    /// Get the `(coffset, uoffset)` tuple from the virtual file offset:
    /// the compressed-stream offset of a BGZF block and the byte offset
    /// within that block's decompressed data.
    pub fn get_coffset_uoffset(&self) -> (u64, u64) {
        (self.0 >> 16, self.0 & 0xffff)
    }
}
1724
1725impl From<u64> for VirtualFileOffsets {
1726 /// Convert u64 into `VirtualFileOffsets`
1727 fn from(value: u64) -> Self {
1728 VirtualFileOffsets(value)
1729 }
1730}
1731impl Debug for VirtualFileOffsets {
1732 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1733 let (coffset, uoffset) = self.get_coffset_uoffset();
1734 write!(f, "coffset: {}, uoffset: {}", coffset, uoffset)
1735 }
1736}
1737
/// Per-reference-sequence index data in a CSI file: the list of bins for one
/// chromosome.
#[derive(Default, Debug)]
struct CsiIndex {
    n_bin: i32,        // number of bins declared in the file for this reference
    bins: Vec<CsiBin>, // bins; sorted by bin number after parsing
}
1743
/// A chunk within a bin in the CSI data structure
#[derive(Default, Debug)]
pub struct CsiChunk {
    /// Virtual file offset of the start of the chunk.
    pub chunk_beg: VirtualFileOffsets,
    /// Virtual file offset of the end of the chunk.
    pub chunk_end: VirtualFileOffsets,
}
1750
/// A bin in the CSI data structure
#[derive(Default, Debug)]
pub struct CsiBin {
    bin: u32,                     // bin number
    _loffset: VirtualFileOffsets, // loffset field (parsed but unused)
    n_chunk: i32,                 // number of chunks declared in the file
    chunks: Vec<CsiChunk>,        // the chunks belonging to this bin
}
1759
1760impl CsiBin {
1761 /// return a slice of chunks with a bin in the CSI data structure
1762 pub fn chunks(&self) -> &[CsiChunk] {
1763 &self.chunks[..]
1764 }
1765}
1766
/// A struct representing CSI index file content
#[derive(Default, Debug)]
pub struct Csi {
    magic: [u8; 4],         // file magic; must be "CSI" + version byte 1
    min_shift: i32,         // number of bits for the minimal interval (per CSI spec)
    depth: i32,             // depth of the binning scheme
    l_aux: i32,             // length of the auxiliary data block
    aux: Vec<u8>,           // auxiliary data (format-specific, kept verbatim)
    n_ref: i32,             // number of reference sequences
    indices: Vec<CsiIndex>, // one index per reference sequence
    n_no_coor: Option<u64>, // optional trailing count of unplaced records
}
1779
1780impl Csi {
1781 /// Create Csi from a path to a `*.csi` file
1782 pub fn from_path(p: impl AsRef<Path>) -> Result<Self> {
1783 let mut csi = Csi::default();
1784 let mut file = smart_reader(p.as_ref())?;
1785 // magic
1786 file.read_exact(csi.magic.as_mut())
1787 .map_err(|e| e.add_context("error in reading csi magic bytes"))?;
1788 if csi.magic != [b'C', b'S', b'I', 1] {
1789 return Err(Error::Io(std::io::Error::new(
1790 io::ErrorKind::Other,
1791 "csi magic is not 'CSI1'".to_owned(),
1792 )));
1793 }
1794 // min_shift
1795 csi.min_shift = file
1796 .read_i32::<LittleEndian>()
1797 .map_err(|e| e.add_context("error in reading csi min_shift field"))?;
1798 // dbg!(csi.min_shift);
1799 // depth
1800 csi.depth = file
1801 .read_i32::<LittleEndian>()
1802 .map_err(|e| e.add_context("error in reading csi depth field"))?;
1803 // dbg!(csi.depth);
1804 // l_aux
1805 csi.l_aux = file
1806 .read_i32::<LittleEndian>()
1807 .map_err(|e| e.add_context("error in reading csi l_aux field"))?;
1808 // dbg!(csi.l_aux);
1809 // aux
1810 csi.aux.resize(csi.l_aux as usize, 0u8);
1811 file.read_exact(csi.aux.as_mut())
1812 .map_err(|e| e.add_context("error in reading csi aux field"))?;
1813 // n_ref
1814 csi.n_ref = file
1815 .read_i32::<LittleEndian>()
1816 .map_err(|e| e.add_context("error in reading csi n_ref field"))?;
1817
1818 // iterate over chromosomes
1819 for _ in 0..csi.n_ref {
1820 let mut idx = CsiIndex {
1821 n_bin: {
1822 file.read_i32::<LittleEndian>()
1823 .map_err(|e| e.add_context("error in reading csi index n_bin field"))?
1824 },
1825 ..Default::default()
1826 };
1827 for _ in 0..idx.n_bin {
1828 let mut bin = CsiBin {
1829 // bin
1830 bin: file
1831 .read_u32::<LittleEndian>()
1832 .map_err(|e| e.add_context("error in reading csi bin bin field"))?,
1833 _loffset: file
1834 .read_u64::<LittleEndian>()
1835 .map_err(|e| {
1836 Error::from(std::io::Error::new(
1837 e.kind(),
1838 "error in reading csi bin loffset field",
1839 ))
1840 })?
1841 .into(),
1842 // n_chunk
1843 n_chunk: file.read_i32::<LittleEndian>().map_err(|e| {
1844 Error::from(std::io::Error::new(
1845 e.kind(),
1846 "error in reading csi bin n_chunk field",
1847 ))
1848 })?,
1849 ..Default::default()
1850 };
1851
1852 for _ in 0..bin.n_chunk {
1853 let chunk = CsiChunk {
1854 chunk_beg: file
1855 .read_u64::<LittleEndian>()
1856 .map_err(|e| {
1857 Error::from(std::io::Error::new(
1858 e.kind(),
1859 "error in reading csi chunk chunk_beg",
1860 ))
1861 })?
1862 .into(),
1863 chunk_end: file
1864 .read_u64::<LittleEndian>()
1865 .map_err(|e| {
1866 Error::from(std::io::Error::new(
1867 e.kind(),
1868 "error in reading csi chunk chunk_end",
1869 ))
1870 })?
1871 .into(),
1872 };
1873 bin.chunks.push(chunk);
1874 }
1875 idx.bins.push(bin);
1876 }
1877 idx.bins.sort_by_key(|x| x.bin);
1878 csi.indices.push(idx);
1879 }
1880 // n_no_coor
1881 csi.n_no_coor = file.read_u64::<LittleEndian>().ok();
1882
1883 Ok(csi)
1884 }
1885
1886 /// Convert positional coordinate range to a bin number
1887 ///
1888 /// `beg`, `end`` coordinates are 0-based. It is exclusive for end.
1889 /// For some reason, not all length bin are searchable.
1890 /// It is seems that setting `end` to `beg` + 1 works well.
1891 // no sure how it works, but it is from CSIv1.pdf c code
1892 pub fn get_bin_id(&self, beg: i64, end: i64) -> u32 {
1893 let mut l = self.depth;
1894 let end = end - 1;
1895 let mut s = self.min_shift;
1896 let mut t = ((1 << (self.depth * 3)) - 1) / 7;
1897
1898 while l > 0 {
1899 // dbg!(s);
1900 if (beg >> s) == (end >> s) {
1901 return (t + (beg >> s)) as u32;
1902 }
1903 s += 3;
1904 t += 1 << (l * 3);
1905 l += 1;
1906 }
1907
1908 0
1909 }
1910
1911 /// Get CsiBin based the chromosome id and bin number.
1912 ///
1913 /// The return CsiBin can provide details of the included chunks.
1914 pub fn get_bin_details(&self, seqid: usize, bin_id: u32) -> Result<&CsiBin> {
1915 // assert!(bin_id <= self.get_bin_limit());
1916 let bins = &self.indices[seqid].bins;
1917 // dbg!(self.indices.len());
1918 // dbg!(bins.len());
1919 let i = bins
1920 .as_slice()
1921 .binary_search_by(|x| x.bin.cmp(&bin_id))
1922 .map_err(|_| Error::CsBinIndexNotFound)?;
1923 // .expect("bin index not found\n");
1924 Ok(&bins[i])
1925 }
1926
1927 /// Get the max possible bin number in theory. Note, the maximum bin may not
1928 /// be present in the Csi index file.
1929 pub fn get_bin_limit(&self) -> u32 {
1930 (1 << (((self.depth + 1) * 3) - 1)) / 7
1931 }
1932}
1933
1934/// BcfReader suitable for read through the BCF file.
1935///
1936/// This assumes that the source reader is in BCF format and stored as BGZF blocks.
1937///
1938/// # Example
1939/// ```
1940/// use bcf_reader::*;
1941/// use std::{
1942/// fs::File,
1943/// io::{BufReader, Seek},
1944/// };
1945/// // underlying reader can be stdin or a file
1946/// let reader = std::fs::File::open("testdata/test3.bcf")
1947/// .map(BufReader::new)
1948/// .unwrap();
1949/// // 1. for sequential decompression (works for data from stdin or a file)
1950/// let reader = flate2::bufread::MultiGzDecoder::new(reader);
1951/// // 2. for parallel decompression (works for data from stdin or a file)
1952/// // however, when data is from stdin, jump is not supported for now.
1953/// // let reader = ParMultiGzipReader::from_reader(reader, 3, None, None);
1954///
1955/// // create bcf reader
1956/// let mut reader = BcfReader::from_reader(reader);
1957/// // read though header
1958/// let _header = reader.read_header();
1959/// // create a reusable record
1960/// let mut record = Record::default();
1961///
1962/// let mut pos_found = vec![];
1963/// // iterate records from the targeted genome interval
1964/// while let Ok(_) = reader.read_record(&mut record) {
1965/// pos_found.push(record.pos() + 1);
1966/// }
1967///
1968/// assert_eq!(pos_found[0], 72);
1969/// assert_eq!(*pos_found.last().unwrap(), 1498841);
1970/// ```
pub struct BcfReader<R>
where
    R: Read,
{
    /// Source of decompressed BCF bytes.
    inner: R,
    /// Whether `read_header` has completed; guards `read_record`.
    header_parsed: bool,
}
1978
1979impl<R> BcfReader<R>
1980where
1981 R: Read,
1982{
    /// Creates a `BcfReader` wrapping `reader`, which should yield
    /// decompressed BCF bytes — e.g. a `flate2` `MultiGzDecoder` or a
    /// [`ParMultiGzipReader`].
    pub fn from_reader(reader: R) -> Self {
        Self {
            inner: reader,
            header_parsed: false,
        }
    }
1992
    /// Read the header
    ///
    /// Consumes the header bytes from the underlying reader and parses them.
    /// Must complete successfully before `read_record` may be called.
    pub fn read_header(&mut self) -> Result<Header> {
        let header =
            Header::from_string(&read_header(&mut self.inner)?).map_err(Error::ParseHeaderError)?;
        self.header_parsed = true;
        Ok(header)
    }
2000
    /// Read one record. This should be called after the header is read and parsed.
    ///
    /// # Panics
    /// Panics if `read_header` has not been called successfully first.
    pub fn read_record(&mut self, record: &mut Record) -> Result<()> {
        assert!(
            self.header_parsed,
            "header should be parsed before reading records"
        );
        record.read(&mut self.inner)
    }
2010}
2011
/// A genome interval defined by chromosome id, start, and end positions.
///
/// Positions are compared directly against `record.pos` (see
/// `IndexedBcfReader::read_record`): a record is in range when
/// `start <= pos` and, if `end` is set, `pos < end`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct GenomeInterval {
    /// chromosome index used for CSI bin lookups
    pub chrom_id: usize,
    /// start position, inclusive lower bound on `record.pos`
    pub start: i64,
    /// exclusive upper bound on `record.pos`; `None` means no upper bound
    pub end: Option<i64>,
}
2018
2019/// IndexedBcfReader allows random access to a specific genome interval of the
/// BCF file using a CSI index file. It is a wrapper around
2021/// [`ParMultiGzipReader<BufReader<File>>`] to allow parallelizable bgzip
2022/// decompression.
2023///
/// The source should be a BCF file consisting of small BGZF blocks.
2025///
2026/// # Example
2027///
2028/// ```
2029/// use bcf_reader::*;
2030/// // create indexed bcf reader
2031/// let mut reader =
2032/// IndexedBcfReader::from_path("testdata/test3.bcf", "testdata/test3.bcf.csi", None).unwrap();
/// // read through header
2034/// let _header = reader.read_header();
2035/// // define targeted genome interval
2036/// let interval = GenomeInterval {
2037/// chrom_id: 0,
2038/// start: 1489230 - 1,
2039/// end: Some(1498509 - 1),
2040/// };
2041/// // set interval
2042/// reader.set_interval(interval);
2043/// // create a reusable record
2044/// let mut record = Record::default();
2045///
2046/// let mut pos_found = vec![];
2047/// // iterate records from the targeted genome interval
2048/// while let Ok(_) = reader.read_record(&mut record) {
2049/// pos_found.push(record.pos() + 1);
2050/// }
2051///
2052/// assert_eq!(
2053/// pos_found,
2054/// vec![
2055/// 1489230, 1489979, 1490069, 1490311, 1492233, 1492337, 1493505, 1494178, 1495207,
2056/// 1495403, 1495746, 1496047, 1497964, 1498188,
2057/// ]
2058/// )
2059/// ```
pub struct IndexedBcfReader {
    /// parallel bgzip decompressor over the underlying BCF file
    inner: ParMultiGzipReader<BufReader<File>>,
    /// parsed CSI index used to locate the file offset for a genome interval
    csi: Csi,
    /// true once `read_header` has completed successfully
    header_parsed: bool,
    /// target interval set via `set_interval`; `read_record` returns an
    /// error while this is `None`
    genome_interval: Option<GenomeInterval>,
}
2066
impl IndexedBcfReader {
    /// Create an IndexedBcfReader from paths to a bcf file and a corresponding
    /// csi index file.
    ///
    /// - `max_gzip`, the number of gzip blocks to read before each batch of
    ///   parallelized decompression. See [`ParMultiGzipReader::from_reader`]
    ///   (by default (None) use 3).
    ///
    /// Note: this constructor does NOT read or parse the header; call
    /// [`Self::read_header`] before reading records.
    pub fn from_path(
        path_bcf: impl AsRef<Path>,
        path_csi: impl AsRef<Path>,
        max_gzip: Option<usize>,
    ) -> Result<Self> {
        let reader = File::open(path_bcf.as_ref()).map(BufReader::new)?;
        let csi = Csi::from_path(path_csi.as_ref())?;
        let reader = ParMultiGzipReader::from_reader(reader, max_gzip.unwrap_or(3), None, None)?;
        Ok(Self {
            inner: reader,
            csi,
            header_parsed: false,
            genome_interval: None,
        })
    }
    /// Read the header bytes, parse them and return a `Header`
    pub fn read_header(&mut self) -> Result<Header> {
        let header =
            Header::from_string(&read_header(&mut self.inner)?).map_err(Error::ParseHeaderError)?;
        self.header_parsed = true;
        Ok(header)
    }

    /// Jump the file pointer to the beginning of the targeted genome interval.
    ///
    /// If there is no site within the genome interval, `read_record` will
    /// return `Err(_)`.
    pub fn set_interval(&mut self, genome_interval: GenomeInterval) -> Result<()> {
        // locate the CSI bin covering the single position `start`
        let bin_id = self
            .csi
            .get_bin_id(genome_interval.start, genome_interval.start + 1);

        // virtual file offset of the bin's first chunk:
        // coffset = offset of the bgzf block within the compressed file,
        // uoffset = byte offset within that block's uncompressed data
        let (coffset, uoffset) = self
            .csi
            .get_bin_details(genome_interval.chrom_id, bin_id)?
            .chunks[0]
            .chunk_beg
            .get_coffset_uoffset();

        let par_reader = &mut self.inner;
        par_reader.inner.seek(io::SeekFrom::Start(coffset))?;

        // clear buffer, especially things related to coffset, so that no
        // stale decompressed data from before the seek can be served
        par_reader.buffer.iter_mut().for_each(|bgzf_buffer| {
            bgzf_buffer.compressed.clear();
            bgzf_buffer.uncompressed.clear();
            bgzf_buffer.coffset = 0;
            bgzf_buffer.gzip_size = 0;
            bgzf_buffer.uncompressed_data_size = 0;
        });
        par_reader.ngzip = 0;
        par_reader.igzip = 0;
        par_reader.ibyte = 0;

        // refill and decompress starting from the new compressed offset
        par_reader.clear_and_fill_buffers()?;
        par_reader.decomp_all()?;

        // skip `uoffset` bytes into the first decompressed block
        par_reader.ibyte = uoffset as usize;

        self.genome_interval = Some(genome_interval);
        Ok(())
    }

    /// Read one record. Should be called after the header is parsed and after
    /// [`Self::set_interval`] has been called; otherwise an error is returned
    /// (`HeaderNotParsed` / `IndexBcfReaderMissingGenomeInterval`).
    ///
    /// Records positioned before the interval start are skipped; once a record
    /// at or beyond the interval end is read, an error is returned.
    ///
    /// NOTE(review): only positions are compared here, not the chromosome id —
    /// presumably the CSI-based seek already confines reading to the right
    /// contig; verify behavior for intervals near contig boundaries.
    pub fn read_record(&mut self, record: &mut Record) -> Result<()> {
        if !self.header_parsed {
            return Err(Error::HeaderNotParsed);
        }

        let interval = self
            .genome_interval
            .as_ref()
            .ok_or(Error::IndexBcfReaderMissingGenomeInterval)?;
        let start = interval.start;
        let end = interval.end;
        loop {
            match record.read(&mut self.inner) {
                Ok(_) => {
                    // at or past the (exclusive) end -> signal via error
                    if let Some(end) = end {
                        if record.pos as i64 >= end {
                            let e =
                                std::io::Error::new(std::io::ErrorKind::NotFound, "out of range");
                            Err(e)?;
                        }
                    }
                    // at or after start -> record is within the interval
                    if record.pos as i64 >= start {
                        return Ok(());
                    }
                    // before start -> keep scanning forward
                }
                Err(e) => return Err(e),
            }
        }
    }
}