bcf_reader/lib.rs
1//! # bcf_reader
2//! This is an attempt to create a small, lightweight, pure-Rust library to allow
3//! efficient, cross-platform access to genotype data in BCF files.
4//!
5//! Currently, the `rust_htslib` crate works only on Linux and macOS (not Windows?).
6//! The noodles crate is a pure Rust library for many bioinformatic file formats and
7//! works across Windows, Linux, and macOS. However, the `noodles` API for reading
8//! genotype data from BCF files can be slow due to its memory allocation patterns.
9//! Additionally, both crates have a large number of dependencies, as they provide
10//! many features and support a wide range of file formats.
11//!
12//! One way to address the memory allocation and dependency issues is to manually
13//! parse BCF records according to its specification
14//! (`<https://samtools.github.io/hts-specs/VCFv4.2.pdf>`) and use iterators whenever
15//! possible, especially for the per-sample fields, like GT and AD.
16//!
17//! Note: This crate is in its early stages of development.
18//!
19//! ## Usage
20//! ```
21//! use bcf_reader::*;
22//! let mut reader = smart_reader("testdata/test2.bcf").unwrap();
23//! let header = Header::from_string(&read_header(&mut reader).unwrap()).unwrap();
24//! // find key for a field in INFO or FORMAT or FILTER
25//! let key = header.get_idx_from_dictionary_str("FORMAT", "GT").unwrap();
26//! // access header dictionary
27//! let d = &header.dict_strings()[&key];
28//! assert_eq!(d["ID"], "GT");
29//! assert_eq!(d["Dictionary"], "FORMAT");
//! // get chromosome name
31//! assert_eq!(header.get_chrname(0), "Pf3D7_01_v3");
32//! let fmt_ad_key = header
33//! .get_idx_from_dictionary_str("FORMAT", "AD")
34//! .expect("FORMAT/AD not found");
35//! let info_af_key = header
36//! .get_idx_from_dictionary_str("INFO", "AF")
37//! .expect("INFO/AF not found");
38//!
39//! // this can be and should be reused to reduce allocation
40//! let mut record = Record::default();
41//! while let Ok(_) = record.read(&mut reader) {
42//! let pos = record.pos();
43//!
44//! // use byte ranges and shared buffer to get allele string values
45//! let allele_byte_ranges = record.alleles();
46//! let share_buf = record.buf_shared();
47//! let ref_rng = &allele_byte_ranges[0];
48//! let ref_allele_str = std::str::from_utf8(&share_buf[ref_rng.clone()]).unwrap();
49//! let alt1_rng = &allele_byte_ranges[1];
50//! let alt1_allele_str = std::str::from_utf8(&share_buf[alt1_rng.clone()]).unwrap();
51//! // ...
52//!
53//! // access FORMAT/GT via iterator
54//! for nv in record.fmt_gt(&header) {
55//! let nv = nv.unwrap();
56//! let (has_no_ploidy, is_missing, is_phased, allele_idx) = nv.gt_val();
57//! // ...
58//! }
59//!
60//! // access FORMAT/AD via iterator
61//! for nv in record.fmt_field(fmt_ad_key) {
62//! let nv = nv.unwrap();
63//! match nv.int_val() {
64//! None => {}
65//! Some(ad) => {
66//! // ...
67//! }
68//! }
69//! // ...
70//! }
71//!
//! // access FILTERS via iterator
73//! record.filters().for_each(|nv| {
74//! let nv = nv.unwrap();
75//! let filter_key = nv.int_val().unwrap() as usize;
76//! let dict_string_map = &header.dict_strings()[&filter_key];
77//! let filter_name = &dict_string_map["ID"];
78//! // ...
79//! });
80//!
//! // access INFO/AF via iterator
82//! record.info_field_numeric(info_af_key).for_each(|nv| {
83//! let nv = nv.unwrap();
84//! let af = nv.float_val().unwrap();
85//! // ...
86//! });
87//! }
88//! ```
89//!
90//! More examples to access each field/column are available in docs of [`Record`] and [`Header`].
91//!
92//! # Reader types
93//! - For parallelized decompression reader, see [`BcfReader`].
//! - For parallelized indexed reader, see [`IndexedBcfReader`].
//! - For the lower-level reader underlying `BcfReader` and `IndexedBcfReader`,
//!   see [`ParMultiGzipReader`].
97//!
98//! # `flate2` backends
99//!
//! By default, a rust backend is used. The other `flate2` backends, `zlib` and
//! `zlib-ng-compat`, have been exported as the corresponding features (`zlib` and
//! `zlib-ng-compat`). See <https://docs.rs/flate2/latest/flate2/> for more details.
103//!
104#![warn(clippy::unwrap_used, clippy::expect_used, clippy::dbg_macro)]
105use byteorder::{LittleEndian, ReadBytesExt};
106use flate2::bufread::DeflateDecoder;
107use rayon::prelude::*;
108use std::fmt::Debug;
109use std::fs::File;
110use std::io;
111use std::io::BufReader;
112use std::io::Read;
113use std::ops::Range;
114use std::path::Path;
115use std::{collections::HashMap, io::Seek};
116
/// Crate-wide result alias using this crate's [`Error`] type.
pub type Result<T> = std::result::Result<T, Error>;
118
/// All error conditions surfaced by this crate.
#[derive(Debug)]
pub enum Error {
    /// Wrapped I/O error from the underlying reader.
    Io(std::io::Error),
    /// An operation required the BCF header before it was parsed.
    HeaderNotParsed,
    /// An indexed read was attempted without a genome interval.
    IndexBcfReaderMissingGenomeInterval,
    /// No CSI bin-index entry matched the requested region.
    CsBinIndexNotFound,
    /// Owned bytes (e.g. header text) were not valid UTF-8.
    FromUtf8Error(std::string::FromUtf8Error),
    /// A borrowed byte slice was not valid UTF-8.
    Utf8Error(std::str::Utf8Error),
    /// The header text could not be parsed into structured data.
    ParseHeaderError(ParseHeaderError),
    /// An integer was requested from an empty numeric value.
    /// (Variant name keeps its historical spelling for API stability.)
    NumericaValueEmptyInt,
    /// A non-`U32` value was reinterpreted as an `f32`; see `NumericValue::as_f32`.
    NumericaValueAsF32Error,
    /// Catch-all error carrying a free-form message.
    Other(String),
}
132
/// Describes a mismatch between the expected and actually-read value of a
/// fixed gzip (BGZF) member-header field.
/// NOTE(review): constructed by gzip-reading code outside this chunk.
#[derive(Debug)]
pub struct ParseGzipHeaderError {
    /// Name of the offending header field.
    pub key: &'static str,
    /// Value the field was expected to hold.
    pub expected: usize,
    /// Value actually read from the stream.
    pub found: usize,
}
139
/// Errors produced while parsing the textual BCF/VCF header lines.
#[derive(Debug)]
pub enum ParseHeaderError {
    /// A header line did not start with the expected `##` prefix.
    HeaderCommentCharError,
    /// A header line had no dictionary name before the first `=`.
    MissingDictionaryname,
    /// Any other structural problem; the payload names the missing piece.
    FormatError(&'static str),
}
146
147trait AddContext {
148 fn add_context(self, context: &'static str) -> Error;
149}
150impl AddContext for std::io::Error {
151 fn add_context(self, context: &'static str) -> Error {
152 Error::from(std::io::Error::new(self.kind(), context))
153 }
154}
155
156// --- Display
157impl std::fmt::Display for Error {
158 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
159 write!(f, "{:?}", *self)
160 }
161}
162impl std::fmt::Display for ParseHeaderError {
163 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
164 write!(f, "{:?}", *self)
165 }
166}
167impl std::fmt::Display for ParseGzipHeaderError {
168 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
169 write!(f, "{:?}", *self)
170 }
171}
172
173// --- Error
174
175impl std::error::Error for Error {}
176
177impl std::error::Error for ParseGzipHeaderError {}
178
179// --- From
180impl From<ParseGzipHeaderError> for std::io::Error {
181 fn from(value: ParseGzipHeaderError) -> Self {
182 std::io::Error::new(std::io::ErrorKind::InvalidData, Box::new(value))
183 }
184}
185impl From<std::io::Error> for Error {
186 fn from(value: std::io::Error) -> Self {
187 Error::Io(value)
188 }
189}
190
/// An iterator used to split a `str` by a separator with separators within
/// pairs of quotes ignored.
pub struct QuotedSplitter<'a> {
    data: &'a str,
    in_quotes: bool,
    sep: char,
    quote: char,
}

impl<'a> QuotedSplitter<'a> {
    /// Creates a new `QuotedSplitter` iterator.
    ///
    /// # Arguments
    ///
    /// * `buffer` - The input string to be split.
    /// * `separator` - The separator character used to split the string.
    /// * `quote` - The quote character used to ignore separators within quotes.
    ///
    /// # Examples
    ///
    /// ```
    /// use bcf_reader::QuotedSplitter;
    /// let input_string = "hello,\"world, this is fun\",test";
    /// let result: Vec<_> = QuotedSplitter::new(input_string, ',', '"').collect();
    /// assert_eq!(result, vec!["hello", "\"world, this is fun\"", "test"]);
    /// ```
    pub fn new(buffer: &'a str, separator: char, quote: char) -> Self {
        QuotedSplitter {
            data: buffer,
            in_quotes: false,
            sep: separator,
            quote,
        }
    }
}

impl<'a> Iterator for QuotedSplitter<'a> {
    type Item = &'a str;

    /// Advances the iterator and returns the next split substring, or `None`
    /// once the remaining input is empty. Quote state persists across calls
    /// so quoted regions may span multiple yielded pieces.
    fn next(&mut self) -> Option<Self::Item> {
        for (idx, ch) in self.data.char_indices() {
            // Toggle the quote state first so a character that is both quote
            // and separator is handled in the same order as a single scan.
            if ch == self.quote {
                self.in_quotes = !self.in_quotes;
            }
            if ch == self.sep && !self.in_quotes {
                let piece = &self.data[..idx];
                let rest = &self.data[idx..];
                self.data = rest.strip_prefix(self.sep).unwrap_or(rest);
                return Some(piece);
            }
        }
        if self.data.is_empty() {
            None
        } else {
            // Yield the trailing segment and leave the iterator exhausted.
            Some(std::mem::take(&mut self.data))
        }
    }
}
256
257/// Represents a header of a BCF file.
258///
259/// The `Header` struct contains information about the dictionar of strings and
260/// contigs, samples, and the index of the FORMAT field "GT". It provides
261/// methods to parse a header from a string, retrieve the index of a dictionary
262/// string, retrieve the chromosome name by index, retrieve the index of the
263/// FORMAT field "GT", and access the dictionary strings, contigs, and samples.
264///
265/// # Examples
266///
267/// ```
268/// use bcf_reader::Header;
269///
270/// let header_text = concat!(
271/// r#"##fileformat=VCFv4.3"#,
272/// "\n",
273/// r#"##FILTER=<ID=PASS,Description="All filters passed">"#,
274/// "\n",
275/// r#"##FILTER=<ID=FAILED1,Description="failed due to something">"#,
276/// "\n",
277/// r#"##contig=<ID=chr1,length=123123123>"#,
278/// "\n",
279/// r#"##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">"#,
280/// "\n",
281/// r#"##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">"#,
282/// "\n",
283/// "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample1\tsample2",
284/// "\n",
285/// );
286///
287/// let header = Header::from_string(&header_text).unwrap();
288///
289/// assert_eq!(header.get_idx_from_dictionary_str("INFO", "DP"), Some(2));
290/// assert_eq!(header.get_chrname(0), "chr1");
291/// assert_eq!(header.get_fmt_gt_id(), Some(3));
292/// assert_eq!(header.dict_contigs().len(), 1);
293/// assert_eq!(header.dict_strings().len(), 4);
294/// assert_eq!(header.get_samples().len(), 2);
295/// ```
#[derive(Debug)]
pub struct Header {
    /// FILTER/INFO/FORMAT dictionary: outer key is the header idx (or explicit
    /// IDX), inner map holds the line's key/value pairs plus a "Dictionary" key.
    dict_strings: HashMap<usize, HashMap<String, String>>,
    /// Contig dictionary: outer key is the contig idx (or explicit IDX).
    dict_contigs: HashMap<usize, HashMap<String, String>>,
    /// Sample names taken from the #CHROM line (columns 10 and onward).
    samples: Vec<String>,
    /// Cached dictionary index of FORMAT/GT, if present.
    fmt_gt_idx: Option<usize>,
}
303impl Header {
    /// Parse raw header text (the `##...` lines plus the `#CHROM` line) into
    /// a structured [`Header`].
    ///
    /// An implicit FILTER/PASS entry is always installed at index 0. Entries
    /// either all carry an explicit `IDX=` key or none do; mixing the two
    /// styles is rejected with a `FormatError`.
    pub fn from_string(text: &str) -> std::result::Result<Self, ParseHeaderError> {
        use ParseHeaderError::*;
        let mut dict_strings = HashMap::<usize, HashMap<String, String>>::new();
        let mut dict_contigs = HashMap::<usize, HashMap<String, String>>::new();
        let mut samples = Vec::<String>::new();

        // implicit FILTER/PASS header line
        let mut m = HashMap::<String, String>::new();
        m.insert("Dictionary".into(), "FILTER".into());
        m.insert("ID".into(), "PASS".into());
        m.insert("Description".into(), r#""All filters passed""#.into());
        dict_strings.insert(0, m);
        // counters start past the implicit PASS entry (strings) / at 0 (contigs)
        let mut dict_str_idx_counter = 1;
        let mut dict_contig_idx_counter = 0;
        for line in QuotedSplitter::new(text.trim_end_matches('\0').trim(), '\n', '"') {
            // the #CHROM line: sample names start at column 10
            if line.starts_with("#CHROM") {
                line.split("\t")
                    .skip(9)
                    .for_each(|s| samples.push(s.into()));
                continue;
            }
            if line.trim().is_empty() {
                continue;
            }
            let mut it = QuotedSplitter::new(
                line.strip_prefix("##").ok_or(HeaderCommentCharError)?,
                '=',
                '"',
            );
            let dict_name = it.next().ok_or(MissingDictionaryname)?;
            // only structured lines of the form ##NAME=<...> feed a dictionary
            let valid_dict = matches!(it.next(), Some(x) if x.starts_with("<"));
            if !valid_dict {
                continue;
            }
            // take the text between the outermost '<' and '>'
            let l = line.find('<').ok_or(FormatError("'<' not found"))?;
            let s = line.split_at(l + 1).1;
            let r = s.rfind('>').ok_or(FormatError("> not found"))?;
            let s = s.split_at(r).0;
            // collect key=value pairs (quotes stripped from values)
            let mut m = HashMap::<String, String>::new();
            for kv_str in QuotedSplitter::new(s, ',', '"') {
                let kv_str = kv_str.trim();

                let mut it = QuotedSplitter::new(kv_str, '=', '"');
                let k = it.next().ok_or(FormatError("key not found"))?;
                let v = it
                    .next()
                    .ok_or(FormatError("value not found"))?
                    .trim_end_matches('"')
                    .trim_start_matches('"');
                m.insert(k.into(), v.into());
            }
            match dict_name {
                "contig" => {
                    if m.contains_key("IDX") {
                        let idx: usize = m["IDX"]
                            .parse()
                            .map_err(|_| FormatError("IDX value parsing error"))?;
                        if dict_contig_idx_counter != 0 {
                            // if one dict string has IDX all of them should have IDX in the dictionary
                            return Err(FormatError("not all CONTIG lines have key IDX"));
                        }
                        dict_contigs.insert(idx, m);
                    } else {
                        dict_contigs.insert(dict_contig_idx_counter, m);
                        dict_contig_idx_counter += 1;
                    }
                }
                _ => {
                    // skip the explicit FILTER/PASS line (index 0 is reserved
                    // for the implicit one) and anything that is not
                    // INFO/FILTER/FORMAT
                    if ((dict_name != "FILTER") || (&m["ID"] != "PASS"))
                        && ["INFO", "FILTER", "FORMAT"].iter().any(|x| *x == dict_name)
                    {
                        m.insert("Dictionary".into(), dict_name.into());
                        if m.contains_key("IDX") {
                            let idx: usize = m["IDX"]
                                .parse()
                                .map_err(|_| FormatError("IDX value parsing error"))?;
                            if dict_str_idx_counter != 1 {
                                // if one dict string has IDX all of them should have IDX in the dictionary
                                return Err(FormatError(
                                    "not all INFO/FILTER/FORMAT lines have key IDX",
                                ));
                            }
                            dict_strings.insert(idx, m);
                        } else {
                            dict_strings.insert(dict_str_idx_counter, m);
                            dict_str_idx_counter += 1;
                        }
                    }
                }
            };
        }

        // find fmt_key for FORMAT/GT for convenience
        let mut fmt_gt_idx = None;
        for (k, m) in dict_strings.iter() {
            if (&m["Dictionary"] == "FORMAT") && (&m["ID"] == "GT") {
                fmt_gt_idx = Some(*k);
            }
        }

        Ok(Self {
            dict_strings,
            dict_contigs,
            samples,
            fmt_gt_idx,
        })
    }
414
415 /// Find the key (offset in header line) for a given INFO/xx or FILTER/xx or FORMAT/xx field.
416 ///
417 /// Example:
418 /// ```
419 /// use bcf_reader::*;
420 /// let mut f = smart_reader("testdata/test.bcf").unwrap();
421 /// let s = read_header(&mut f).unwrap();
422 /// let header = Header::from_string(&s).unwrap();
423 /// let key_found = header.get_idx_from_dictionary_str("FORMAT", "GT").unwrap();
424 /// assert_eq!(key_found, header.get_fmt_gt_id().unwrap());
425 /// ```
426 pub fn get_idx_from_dictionary_str(&self, dictionary: &str, field: &str) -> Option<usize> {
427 for (k, m) in self.dict_strings.iter() {
428 if (m["Dictionary"] == dictionary) && (m["ID"] == field) {
429 return Some(*k);
430 }
431 }
432 None
433 }
434
    /// Get chromosome name from the contig index
    ///
    /// # Panics
    /// Panics if `idx` is absent from the contig dictionary or the contig
    /// entry has no "ID" key (plain `HashMap` indexing).
    pub fn get_chrname(&self, idx: usize) -> &str {
        &self.dict_contigs[&idx]["ID"]
    }

    /// Get key for FORMAT/GT field, or `None` when the header declares no GT.
    pub fn get_fmt_gt_id(&self) -> Option<usize> {
        self.fmt_gt_idx
    }

    /// Get hashmap of hashmap of dictionary of contigs
    /// outer key: contig_idx
    /// inner key: is the key of the dictionary of contig, such as 'ID', 'Description'
    pub fn dict_contigs(&self) -> &HashMap<usize, HashMap<String, String>> {
        &self.dict_contigs
    }

    /// Get hashmap of hashmap of dictionary of strings
    /// outer key: item_idx, for FILTER/xx, FORMAT/xx, INFO/xx
    /// inner key: is the key of the dictionary of string, such as 'ID', 'Description'
    pub fn dict_strings(&self) -> &HashMap<usize, HashMap<String, String>> {
        &self.dict_strings
    }

    /// Get sample names (in #CHROM-line column order).
    /// Example:
    /// ```
    /// use bcf_reader::*;
    /// // read data generated by bcftools
    /// // bcftools query -l test.bcf | bgzip -c > test_samples.gz
    /// let mut samples_str = String::new();
    /// smart_reader("./testdata/test_samples.gz")
    ///     .unwrap()
    ///     .read_to_string(&mut samples_str)
    ///     .unwrap();
    /// // read data via bcf-reader
    /// let mut f = smart_reader("testdata/test.bcf").unwrap();
    /// let s = read_header(&mut f).unwrap();
    /// let header = Header::from_string(&s).unwrap();
    /// let samples_str2 = header.get_samples().join("\n");
    /// // compare bcftools results and bcf-reader results
    /// assert_eq!(samples_str.trim(), samples_str2.trim());
    /// ```
    pub fn get_samples(&self) -> &Vec<String> {
        &self.samples
    }
481}
482
/// map bcf2 type to width in bytes
///
/// `typ`:
/// - 0: MISSING (0 bytes)
/// - 1: u8 (1 byte)
/// - 2: u16 (2 bytes)
/// - 3: u32 (4 bytes)
/// - 5: f32 (4 bytes)
/// - 7: c-char (u8, 1 byte)
///
/// # Panics
/// Panics on any other type code (4 and 6 are reserved in BCF2).
pub fn bcf2_typ_width(typ: u8) -> usize {
    match typ {
        0x0 => 0,
        0x1 => 1,
        0x2 => 2,
        0x3 => 4,
        0x5 => 4,
        0x7 => 1,
        // was a bare panic!(); name the offending code for easier debugging
        _ => panic!("invalid BCF2 type code: {typ}"),
    }
}
503
504#[derive(Debug, PartialEq)]
505/// Represents a numeric value in the context of the bcf-reader.
506pub enum NumericValue {
507 /// Represents an unsigned 8-bit integer value.
508 U8(u8),
509 /// Represents an unsigned 16-bit integer value.
510 U16(u16),
511 /// Represents an unsigned 32-bit integer value.
512 U32(u32),
513 /// Represents a 32-bit floating-point value. (Note that a u32 is used to
514 /// hold the bits for the f32 value)
515 F32(u32),
516}
517
518impl Default for NumericValue {
519 fn default() -> Self {
520 NumericValue::U8(0)
521 }
522}
523
524impl From<u8> for NumericValue {
525 fn from(value: u8) -> Self {
526 Self::U8(value)
527 }
528}
529impl From<u16> for NumericValue {
530 fn from(value: u16) -> Self {
531 Self::U16(value)
532 }
533}
534impl From<u32> for NumericValue {
535 fn from(value: u32) -> Self {
536 Self::U32(value)
537 }
538}
539
540impl NumericValue {
541 fn is_missing(&self) -> bool {
542 match *self {
543 NumericValue::U8(x) => x == 0x80,
544 NumericValue::U16(x) => x == 0x8000,
545 NumericValue::U32(x) => x == 0x80000000,
546 NumericValue::F32(x) => x == 0x7F800001,
547 }
548 }
549
550 fn is_end_of_vector(&self) -> bool {
551 match *self {
552 NumericValue::U8(x) => x == 0x81,
553 NumericValue::U16(x) => x == 0x8001,
554 NumericValue::U32(x) => x == 0x80000001,
555 NumericValue::F32(x) => x == 0x7F800002,
556 }
557 }
558
559 fn as_f32(&self) -> Result<Self> {
560 match *self {
561 NumericValue::U32(x) => Ok(NumericValue::F32(x)),
562 _ => Err(Error::NumericaValueAsF32Error),
563 }
564 }
565
566 /// Returns the integer value if the NumericValue is an unsigned integer and not missing.
567 ///
568 /// # Examples
569 ///
570 /// ```
571 /// use bcf_reader::NumericValue;
572 ///
573 /// let value = NumericValue::U8(42);
574 /// assert_eq!(value.int_val(), Some(42u32));
575 ///
576 /// let missing_value = NumericValue::U8(0x80u8);
577 /// assert_eq!(missing_value.int_val(), None);
578 /// ```
579 pub fn int_val(&self) -> Option<u32> {
580 if self.is_end_of_vector() || self.is_missing() {
581 None
582 } else {
583 match *self {
584 Self::U8(x) => Some(x as u32),
585 Self::U16(x) => Some(x as u32),
586 Self::U32(x) => Some(x),
587 _ => None,
588 }
589 }
590 }
591
592 /// Returns the floating-point value if the NumericValue is a 32-bit float and not missing.
593 ///
594 /// # Examples
595 ///
596 /// ```
597 /// use bcf_reader::NumericValue;
598 ///
599 /// let value = NumericValue::F32(3.14f32.to_bits());
600 /// assert_eq!(value.float_val(), Some(3.14f32));
601 ///
602 /// let missing_value = NumericValue::F32(0x7F800001);
603 /// let missing_value2 = NumericValue::F32(0x7F800001);
604 /// assert_eq!(missing_value.float_val(), missing_value2.float_val());
605 /// dbg!(&missing_value);
606 /// assert_eq!(missing_value.float_val(), None);
607 /// ```
608 pub fn float_val(&self) -> Option<f32> {
609 if self.is_end_of_vector() || self.is_missing() {
610 None
611 } else {
612 match *self {
613 Self::F32(x) => Some(f32::from_bits(x)),
614 _ => panic!(),
615 }
616 }
617 }
618
619 /// Returns a tuple representing the GT value.
620 ///
621 /// The tuple contains the following elements:
622 /// - `noploidy`: A boolean indicating if the ploidy is missing (for individuals with fewer ploids compared to individuals with the maximum ploidy).
623 /// - `dot`: A boolean indicating if the genotype call is a dot.
624 /// - `phased`: A boolean indicating if the allele is phased (the first allele of a call is always unphased).
625 /// - `allele`: The allele value (index).
626 ///
627 /// # Examples
628 ///
629 /// ```
630 /// use bcf_reader::NumericValue;
631 ///
632 /// let value = NumericValue::U8(3);
633 /// assert_eq!(value.gt_val(), (false, false, true, 0));
634 ///
635 /// let value = NumericValue::U8(5);
636 /// assert_eq!(value.gt_val(), (false, false, true, 1));
637 ///
638 /// let missing_value = NumericValue::U8(0);
639 /// assert_eq!(missing_value.gt_val(), (false, true, false, u32::MAX));
640 /// ```
641 pub fn gt_val(&self) -> (bool, bool, bool, u32) {
642 let mut noploidy = false;
643 let mut dot = false;
644 let mut phased = false;
645 let mut allele = u32::MAX;
646
647 match self.int_val() {
648 None => {
649 noploidy = true;
650 }
651 Some(int_val) => {
652 phased = (int_val & 0x1) != 0;
653
654 let int_val = int_val >> 1;
655 if int_val == 0 {
656 dot = true;
657 } else {
658 allele = int_val - 1;
659 }
660 }
661 };
662
663 (noploidy, dot, phased, allele)
664 }
665}
666
/// Read typed descriptor from the reader (of decompressed BCF buffer)
///
/// Return `typ` for type and `n` for count of elements of the type.
/// The descriptor byte packs the type code in its low 4 bits and the count
/// in the high 4 bits.
pub fn read_typed_descriptor_bytes<R>(reader: &mut R) -> std::io::Result<(u8, usize)>
where
    R: std::io::Read + ReadBytesExt,
{
    let tdb = reader.read_u8()?;
    let typ = tdb & 0xf;
    let mut n = (tdb >> 4) as usize;
    // 15 is the overflow marker: the real count follows as a typed integer
    if n == 15 {
        n = read_single_typed_integer(reader)? as usize;
    }
    Ok((typ, n))
}
682
683/// Read a single typed integer from the reader (of decompressed BCF buffer)
684pub fn read_single_typed_integer<R>(reader: &mut R) -> std::io::Result<u32>
685where
686 R: std::io::Read + ReadBytesExt,
687{
688 let (typ, n) = read_typed_descriptor_bytes(reader)?;
689 assert_eq!(n, 1);
690 Ok(match typ {
691 1 => reader.read_u8()? as u32,
692 2 => reader.read_u16::<LittleEndian>()? as u32,
693 3 => reader.read_u32::<LittleEndian>()?,
694 _ => panic!(),
695 })
696}
697
/// Iterator for accessing arrays of numeric values (integers or floats)
/// directly from the buffer bytes without building Vec<_> or Vec<Vec<_>>
/// for each site.
#[derive(Default, Debug)]
pub struct NumericValueIter<'r> {
    /// Cursor over the raw (decompressed) field bytes.
    reader: std::io::Cursor<&'r [u8]>,
    /// BCF2 type code of the elements (see [`bcf2_typ_width`]).
    typ: u8,
    /// Total number of elements to yield.
    len: usize,
    /// Number of elements yielded so far.
    cur: usize,
}
708
709impl<'r> Iterator for NumericValueIter<'r> {
710 type Item = std::io::Result<NumericValue>;
711 fn next(&mut self) -> Option<Self::Item> {
712 if self.cur >= self.len {
713 None
714 } else {
715 match self.typ {
716 0 => None,
717 1 => {
718 self.cur += 1;
719 Some(self.reader.read_u8().map(NumericValue::from))
720 }
721 2 => {
722 self.cur += 1;
723 Some(
724 self.reader
725 .read_u16::<LittleEndian>()
726 .map(NumericValue::from),
727 )
728 }
729 3 => {
730 self.cur += 1;
731 Some(
732 self.reader
733 .read_u32::<LittleEndian>()
734 .map(NumericValue::from),
735 )
736 }
737 5 => {
738 self.cur += 1;
739 let val_res = match self
740 .reader
741 .read_u32::<LittleEndian>()
742 .map(NumericValue::from)
743 {
744 Ok(nv) => match nv.as_f32() {
745 Ok(nv) => Ok(nv),
746 Err(_e) => Err(std::io::Error::new(
747 std::io::ErrorKind::Other,
748 Error::NumericaValueAsF32Error,
749 )),
750 },
751 Err(e) => Err(e),
752 };
753
754 Some(val_res)
755 }
756 _ => panic!(),
757 }
758 }
759 }
760}
761
/// Generate an iterator of numbers from a continuous bytes buffer
/// - typ: data type byte
/// - n: total number of elements to iterate
/// - buffer: the bytes buffer
///
/// Despite the name, the returned iterator also yields floats when `typ` is 5.
pub fn iter_typed_integers(typ: u8, n: usize, buffer: &[u8]) -> NumericValueIter {
    NumericValueIter {
        reader: std::io::Cursor::new(buffer),
        typ,
        len: n,
        cur: 0,
    }
}
774
775/// Read a typed string from the reader to a Rust String
776pub fn read_typed_string<R>(reader: &mut R, buffer: &mut Vec<u8>) -> std::io::Result<usize>
777where
778 R: std::io::Read + ReadBytesExt,
779{
780 let (typ, n) = read_typed_descriptor_bytes(reader)?;
781 assert_eq!(typ, 0x7);
782 let s = buffer.len();
783 buffer.resize(s + n, b'\0');
784 reader.read_exact(&mut buffer.as_mut_slice()[s..s + n])?;
785 Ok(n)
786}
787
788/// read the header lines to a String
789/// use Header::from_string(text) to convert the string into structured data
790pub fn read_header<R>(reader: &mut R) -> Result<String>
791where
792 R: std::io::Read + ReadBytesExt,
793{
794 // read magic
795 let mut magic = [0u8; 3];
796 reader.read_exact(&mut magic)?;
797 assert_eq!(&magic, b"BCF");
798
799 // read major verion and minor version
800 let major = reader.read_u8()?;
801 let minor = reader.read_u8()?;
802 assert_eq!(major, 2);
803 assert_eq!(minor, 2);
804
805 // read text length
806 let l_length = reader.read_u32::<LittleEndian>()?;
807 let mut text = vec![0u8; l_length as usize];
808 reader.read_exact(&mut text)?;
809
810 String::from_utf8(text).map_err(Error::FromUtf8Error)
811}
812
/// Represents a record (a line or a site) in BCF file
#[derive(Default, Debug)]
pub struct Record {
    /// Raw bytes of the record's shared block (CHROM..INFO).
    buf_shared: Vec<u8>,
    /// Raw bytes of the record's per-sample (FORMAT) block.
    buf_indiv: Vec<u8>,
    /// Contig index into the header's contig dictionary.
    chrom: i32,
    /// Position as stored in the record. NOTE(review): BCF stores 0-based
    /// positions on disk — confirm before displaying as 1-based.
    pos: i32,
    /// Reference length.
    rlen: i32,
    /// Site quality; raw f32 bits wrapped in a `NumericValue::F32`.
    qual: NumericValue,
    /// Number of INFO entries.
    n_info: u16,
    /// Number of alleles (REF + ALTs).
    n_allele: u16,
    /// Number of samples (24-bit field on disk).
    n_sample: u32,
    /// Number of FORMAT fields.
    n_fmt: u8,
    /// Byte range of the ID string within `buf_shared`.
    id: Range<usize>,
    /// Byte ranges of the allele strings within `buf_shared`.
    alleles: Vec<Range<usize>>,
    /// (typ, n, byte_range)
    filters: (u8, usize, Range<usize>),
    /// (info_key, typ, n, byte_range)
    info: Vec<(usize, u8, usize, Range<usize>)>,
    /// (fmt_key, typ, n, byte_range)
    gt: Vec<(usize, u8, usize, Range<usize>)>,
}
835impl Record {
836 /// read a record (copy bytes from the reader to the record's interval
837 /// buffers), and separate fields
838 pub fn read<R>(&mut self, reader: &mut R) -> Result<()>
839 where
840 R: std::io::Read + ReadBytesExt,
841 {
842 let l_shared = match reader.read_u32::<LittleEndian>() {
843 Ok(x) => x,
844 Err(_x) => Err(_x)?,
845 };
846 let l_indv = reader.read_u32::<LittleEndian>()?;
847 // dbg!(l_shared, l_indv);
848 self.buf_shared.resize(l_shared as usize, 0u8);
849 self.buf_indiv.resize(l_indv as usize, 0u8);
850 reader.read_exact(self.buf_shared.as_mut_slice())?;
851 reader.read_exact(self.buf_indiv.as_mut_slice())?;
852 self.parse_shared()?;
853 self.parse_indv()?;
854 // dbg!(self.pos);
855 Ok(())
856 }
    /// parse shared fields
    ///
    /// Decodes the fixed-width prefix (CHROM, POS, RLEN, QUAL and the counts)
    /// and then records byte ranges into `buf_shared` for ID, alleles, FILTER
    /// and INFO instead of materializing their values.
    fn parse_shared(&mut self) -> std::io::Result<()> {
        let mut reader = std::io::Cursor::new(self.buf_shared.as_slice());
        self.chrom = reader.read_i32::<LittleEndian>()?;
        self.pos = reader.read_i32::<LittleEndian>()?;
        self.rlen = reader.read_i32::<LittleEndian>()?;
        // QUAL is an f32 stored as raw bits; `as_f32` just tags the bits
        let qual_u32 = reader.read_u32::<LittleEndian>()?;
        self.qual = NumericValue::from(qual_u32)
            .as_f32()
            .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
        self.n_info = reader.read_u16::<LittleEndian>()?;
        self.n_allele = reader.read_u16::<LittleEndian>()?;
        // n_sample (low 24 bits) and n_fmt (high 8 bits) share one u32
        let combined = reader.read_u32::<LittleEndian>()?;
        self.n_sample = combined & 0xffffff;
        self.n_fmt = (combined >> 24) as u8;
        // id
        let (typ, n) = read_typed_descriptor_bytes(&mut reader)?;
        assert_eq!(typ, 0x7);
        let cur = reader.position() as usize;
        self.id = cur..cur + n;
        reader.seek(std::io::SeekFrom::Current(n as i64))?;
        // alleles
        self.alleles.clear();
        for _ in 0..self.n_allele {
            let (typ, n) = read_typed_descriptor_bytes(&mut reader)?;
            assert_eq!(typ, 0x7);
            let cur = reader.position() as usize;
            self.alleles.push(cur..cur + n);
            reader.seek(std::io::SeekFrom::Current(n as i64))?;
        }
        // filters
        let (typ, n) = read_typed_descriptor_bytes(&mut reader)?;
        let width: usize = bcf2_typ_width(typ);
        let s = reader.position() as usize;
        let e = s + width * n;
        reader.seek(std::io::SeekFrom::Current((e - s) as i64))?;
        self.filters = (typ, n, s..e);
        // infos
        self.info.clear();
        for _idx in 0..(self.n_info as usize) {
            let info_key = read_single_typed_integer(&mut reader)?;
            let (typ, n) = read_typed_descriptor_bytes(&mut reader)?;
            let width = bcf2_typ_width(typ);
            let s = reader.position() as usize;
            let e = s + width * n;
            reader.seek(std::io::SeekFrom::Current((e - s) as i64))?;
            self.info.push((info_key as usize, typ, n, s..e));
        }
        Ok(())
    }
    /// parse indiv fields, complicated field will need further processing
    ///
    /// Records (fmt_key, typ, n, byte_range) for each FORMAT field; each byte
    /// range covers all samples (width * n_sample * n bytes of `buf_indiv`).
    fn parse_indv(&mut self) -> std::io::Result<()> {
        let mut reader = std::io::Cursor::new(self.buf_indiv.as_slice());
        self.gt.clear();
        for _idx in 0..(self.n_fmt as usize) {
            let fmt_key = read_single_typed_integer(&mut reader)?;
            let (typ, n) = read_typed_descriptor_bytes(&mut reader)?;
            let width = bcf2_typ_width(typ);
            let s = reader.position() as usize;
            let e = s + width * self.n_sample as usize * n;
            reader.seek(std::io::SeekFrom::Current((e - s) as i64))?;
            self.gt.push((fmt_key as usize, typ, n, s..e));
        }
        Ok(())
    }
922
    /// get chromosome offset
    ///
    /// The value is an index into the header's contig dictionary; resolve it
    /// to a name with [`Header::get_chrname`].
    /// Example:
    /// ```
    /// use bcf_reader::*;
    /// use std::io::Write;
    /// // read data generated by bcftools
    /// // bcftools query -f '%CHROM\n' test.bcf | bgzip -c > test_chrom.gz
    /// let mut chrom_str = String::new();
    /// smart_reader("testdata/test_chrom.gz")
    ///     .unwrap()
    ///     .read_to_string(&mut chrom_str)
    ///     .unwrap();
    /// // read data via bcf-reader
    /// let mut f = smart_reader("testdata/test.bcf").unwrap();
    /// let s = read_header(&mut f).unwrap();
    /// let header = Header::from_string(&s).unwrap();
    /// let mut record = Record::default();
    /// let mut chrom_str2 = Vec::<u8>::new();
    /// while let Ok(_) = record.read(&mut f) {
    ///     write!(
    ///         chrom_str2,
    ///         "{}\n",
    ///         header.get_chrname(record.chrom() as usize)
    ///     )
    ///     .unwrap();
    /// }
    /// let chrom_str2 = String::from_utf8(chrom_str2).unwrap();
    /// // compare bcftools results and bcf-reader results
    /// assert_eq!(chrom_str, chrom_str2);
    /// ```
    pub fn chrom(&self) -> i32 {
        self.chrom
    }
956
    /// Returns the reference length of the record.
    pub fn rlen(&self) -> i32 {
        self.rlen
    }

    /// Returns the quality score of the record, if available
    /// (`None` when QUAL holds the missing value).
    pub fn qual(&self) -> Option<f32> {
        self.qual.float_val()
    }

    /// Returns the number of alleles (REF + ALTs) at this site.
    pub fn n_allele(&self) -> u16 {
        self.n_allele
    }
970
971 /// Returns an iterator over the genotype values in the record's FORMAT field.
972 /// If no FORMAT/GT field available, the returned iterator will have items.
973 /// Example:
974 /// ```
975 /// use bcf_reader::*;
976 /// use std::io::Write;
977 /// // read data generated by bcftools
978 /// // bcftools query -f '[\t%GT]\n' test.bcf | bgzip -c > test_gt.gz
979 /// let mut gt_str = String::new();
980 /// smart_reader("testdata/test_gt.gz")
981 /// .unwrap()
982 /// .read_to_string(&mut gt_str)
983 /// .unwrap();
984 /// // read data via bcf-reader
985 /// let mut f = smart_reader("testdata/test.bcf").unwrap();
986 /// let s = read_header(&mut f).unwrap();
987 /// let header = Header::from_string(&s).unwrap();
988 /// let mut record = Record::default();
989 /// let mut gt_str2 = Vec::<u8>::new();
990 /// while let Ok(_) = record.read(&mut f) {
991 /// for (i, bn) in record.fmt_gt(&header).enumerate() {
992 /// let (noploidy, dot, phased, allele) = bn.unwrap().gt_val();
993 /// assert_eq!(noploidy, false); // missing ploidy
994 /// let mut sep = '\t';
995 /// if i % 2 == 1 {
996 /// if phased {
997 /// sep = '|';
998 /// } else {
999 /// sep = '/';
1000 /// }
1001 /// }
1002 /// if dot {
1003 /// write!(gt_str2, "{sep}.").unwrap();
1004 /// } else {
1005 /// write!(gt_str2, "{sep}{allele}").unwrap();
1006 /// }
1007 /// }
1008 /// write!(gt_str2, "\n").unwrap();
1009 /// }
1010 /// let gt_str2 = String::from_utf8(gt_str2).unwrap();
1011 /// // compare bcftools results and bcf-reader results
1012 /// for (a, b) in gt_str
1013 /// .split(|c| (c == '\n') || (c == '\t'))
1014 /// .zip(gt_str2.split(|c| (c == '\n') || (c == '\t')))
1015 /// {
1016 /// assert_eq!(a, b);
1017 /// }
1018 /// ```
1019 pub fn fmt_gt(&self, header: &Header) -> NumericValueIter<'_> {
1020 let mut it = NumericValueIter::default();
1021 match header.get_fmt_gt_id() {
1022 None => it,
1023 Some(fmt_gt_id) => {
1024 // find the right field for gt
1025 self.gt.iter().for_each(|e| {
1026 if e.0 == fmt_gt_id {
1027 it = iter_typed_integers(
1028 e.1,
1029 e.2 * self.n_sample as usize,
1030 &self.buf_indiv[e.3.start..e.3.end],
1031 );
1032 }
1033 });
1034 it
1035 }
1036 }
1037 }
1038
1039 /// Returns an iterator over all values for a field in the record's FORMATs (indiv).
1040 ///
1041 /// Example:
1042 /// ```
1043 /// use bcf_reader::*;
1044 /// use std::io::Write;
1045 /// // read data generated by bcftools
1046 /// // bcftools query -f '[\t%AD]\n' test.bcf | bgzip -c > test_ad.gz
1047 /// let mut ad_str = String::new();
1048 /// smart_reader("testdata/test_ad.gz")
1049 /// .unwrap()
1050 /// .read_to_string(&mut ad_str)
1051 /// .unwrap();
1052 /// // read data via bcf-reader
1053 /// let mut f = smart_reader("testdata/test.bcf").unwrap();
1054 /// let s = read_header(&mut f).unwrap();
1055 /// let header = Header::from_string(&s).unwrap();
1056 /// let mut record = Record::default();
1057 /// let mut ad_str2 = Vec::<u8>::new();
    /// let ad_field_key = header.get_idx_from_dictionary_str("FORMAT", "AD").unwrap();
    /// while let Ok(_) = record.read(&mut f) {
    ///     for (i, val) in record.fmt_field(ad_field_key).enumerate() {
1061 /// let val = val.unwrap();
1062 /// if i % record.n_allele() as usize == 0 {
1063 /// if ad_str2.last().map(|c| *c == b',') == Some(true) {
1064 /// ad_str2.pop(); // trim last allele separator
1065 /// }
1066 /// ad_str2.push(b'\t'); // sample separator
1067 /// }
1068 /// match val.int_val() {
1069 /// None => {}
1070 /// Some(ad) => {
1071 /// write!(ad_str2, "{ad},").unwrap(); // allele separator
1072 /// }
1073 /// }
1074 /// }
1075 /// // site separator
1076 /// *ad_str2.last_mut().unwrap() = b'\n'; // sample separator
1077 /// }
1078 /// let ad_str2 = String::from_utf8(ad_str2).unwrap();
1079 /// // compare bcftools results and bcf-reader results
1080 /// for (a, b) in ad_str
1081 /// .split(|c| (c == '\n') || (c == '\t'))
1082 /// .zip(ad_str2.split(|c| (c == '\n') || (c == '\t')))
1083 /// {
1084 /// assert_eq!(a, b);
1085 /// }
1086 /// ```
1087 pub fn fmt_field(&self, fmt_key: usize) -> NumericValueIter<'_> {
1088 // default iterator
1089 let mut it = NumericValueIter::default();
1090
1091 // find the right field for gt
1092 self.gt.iter().for_each(|e| {
1093 if e.0 == fmt_key {
1094 it = iter_typed_integers(
1095 e.1,
1096 e.2 * self.n_sample as usize,
1097 &self.buf_indiv[e.3.start..e.3.end],
1098 );
1099 }
1100 });
1101 it
1102 }
1103
1104 /// get 0-based position (bp) value
1105 /// Example:
1106 /// ```
1107 /// use bcf_reader::*;
1108 /// // read data generated by bcftools
1109 /// // bcftools query -f '%POS\n' test.bcf | bgzip -c > test_pos.gz
1110 /// let mut pos_str = String::new();
1111 /// smart_reader("testdata/test_pos.gz")
1112 /// .unwrap()
1113 /// .read_to_string(&mut pos_str)
1114 /// .unwrap();
1115 /// // read data via bcf-reader
1116 /// let mut f = smart_reader("testdata/test.bcf").unwrap();
1117 /// let _s = read_header(&mut f).unwrap();
1118 /// let mut record = Record::default();
1119 /// let mut pos_str2 = Vec::<u8>::new();
1120 /// use std::io::Write;
1121 /// while let Ok(_) = record.read(&mut f) {
1122 /// write!(pos_str2, "{}\n", record.pos() + 1).unwrap();
1123 /// }
1124 /// let pos_str2 = String::from_utf8(pos_str2).unwrap();
1125 /// // compare bcftools results and bcf-reader results
1126 /// assert_eq!(pos_str, pos_str2);
1127 /// ```
    pub fn pos(&self) -> i32 {
        // Stored 0-based, as in the BCF binary encoding; add 1 for VCF-style output.
        self.pos
    }
1131
1132 /// get variant ID as &str
1133 /// Example:
1134 /// ```
1135 /// use bcf_reader::*;
1136 /// // read data generated by bcftools
1137 /// // bcftools query -f '%ID\n' test.bcf | bgzip -c > test_id.gz
1138 /// let mut id_str = String::new();
1139 ///
1140 /// smart_reader("testdata/test_id.gz")
1141 /// .unwrap()
1142 /// .read_to_string(&mut id_str)
1143 /// .unwrap();
1144 ///
1145 /// // read data via bcf-reader
1146 /// let mut f = smart_reader("testdata/test.bcf").unwrap();
1147 /// let _s = read_header(&mut f).unwrap();
1148 /// let mut record = Record::default();
1149 /// let mut id_str2 = Vec::<u8>::new();
1150 /// use std::io::Write;
1151 /// while let Ok(_) = record.read(&mut f) {
1152 /// let record_id = record.id().unwrap();
1153 /// let id = if record_id.is_empty() { "." } else { record_id };
1154 /// write!(id_str2, "{}\n", id).unwrap();
1155 /// }
1156 /// let id_str2 = String::from_utf8(id_str2).unwrap();
1157 /// // compare bcftools results and bcf-reader results
1158 /// assert_eq!(id_str, id_str2);
1159 /// ```
1160 pub fn id(&self) -> Result<&str> {
1161 std::str::from_utf8(&self.buf_shared[self.id.start..self.id.end]).map_err(Error::Utf8Error)
1162 }
1163
1164 /// Returns the ranges of bytes in buf_shared for all alleles in the record.
1165 /// Example:
1166 /// ```
1167 /// use bcf_reader::*;
1168 /// // read data generated by bcftools
1169 /// // bcftools query -f '%REF,%ALT\n' test.bcf | bgzip -c > test_allele.gz
1170 /// let mut allele_str = String::new();
1171 /// smart_reader("testdata/test_allele.gz")
1172 /// .unwrap()
1173 /// .read_to_string(&mut allele_str)
1174 /// .unwrap();
1175 /// // read data via bcf-reader
1176 /// let mut f = smart_reader("testdata/test.bcf").unwrap();
1177 /// let _s = read_header(&mut f).unwrap();
1178 /// let mut record = Record::default();
1179 /// let mut allele_str2 = Vec::<u8>::new();
1180 /// while let Ok(_) = record.read(&mut f) {
1181 /// for rng in record.alleles().iter() {
1182 /// let slice = &record.buf_shared()[rng.clone()];
1183 /// allele_str2.extend(slice);
1184 /// allele_str2.push(b',');
1185 /// }
1186 /// *allele_str2.last_mut().unwrap() = b'\n';
1187 /// }
1188 /// let allele_str2 = String::from_utf8(allele_str2).unwrap();
1189 /// // compare bcftools results and bcf-reader results
1190 /// assert_eq!(allele_str, allele_str2);
1191 /// ```
1192 pub fn alleles(&self) -> &[Range<usize>] {
1193 &self.alleles[..]
1194 }
1195
1196 /// Return an iterator of numeric values for an INFO/xxx field.
1197 /// If the key is not found, the returned iterator will have a zero length.
1198 ///
1199 /// Example:
1200 /// ```
1201 /// use bcf_reader::*;
1202 /// use std::io::Write;
1203 /// // read data generated by bcftools
1204 /// // bcftools query -f '%INFO/AF\n' testdata/test2.bcf | bgzip -c > testdata/test2_info_af.gz
1205 /// let mut info_af_str = String::new();
1206 /// smart_reader("testdata/test2_info_af.gz")
1207 /// .unwrap()
1208 /// .read_to_string(&mut info_af_str)
1209 /// .unwrap();
1210 /// // read data via bcf-reader
1211 /// let mut f = smart_reader("testdata/test2.bcf").unwrap();
1212 /// let s = read_header(&mut f).unwrap();
1213 /// let header = Header::from_string(&s).unwrap();
1214 /// let mut record = Record::default();
1215 /// let mut info_af_str2 = Vec::<u8>::new();
1216 /// let info_af_key = header.get_idx_from_dictionary_str("INFO", "AF").unwrap();
1217 /// while let Ok(_) = record.read(&mut f) {
1218 /// record.info_field_numeric(info_af_key).for_each(|nv| {
1219 /// let af = nv.unwrap().float_val().unwrap();
1220 /// write!(info_af_str2, "{af},").unwrap();
1221 /// });
1222 /// *info_af_str2.last_mut().unwrap() = b'\n'; // line separators
1223 /// }
1224 /// let filter_str2 = String::from_utf8(info_af_str2).unwrap();
1225 /// assert_eq!(info_af_str, filter_str2);
1226 /// ```
1227 pub fn info_field_numeric(&self, info_key: usize) -> NumericValueIter {
1228 // default
1229 let mut it = NumericValueIter {
1230 reader: std::io::Cursor::new(&[0u8; 0]),
1231 typ: 0,
1232 len: 0,
1233 cur: 0,
1234 };
1235 for (key, typ, n, rng) in self.info.iter() {
1236 if *key == info_key {
1237 it = NumericValueIter {
1238 reader: std::io::Cursor::new(&self.buf_shared[rng.start..rng.end]),
1239 typ: *typ,
1240 len: *n,
1241 cur: 0,
1242 };
1243 break;
1244 }
1245 }
1246 it
1247 }
1248
1249 /// Return str value for an INFO/xxx field.
1250 /// If the key is not found or data type is not string, then return None.
1251 pub fn info_field_str(&self, info_key: usize) -> Result<Option<&str>> {
1252 let mut res = None;
1253 for (key, typ, _n, rng) in self.info.iter() {
1254 if *key == info_key {
1255 if *typ != 0x7 {
1256 return Ok(None);
1257 }
1258 let s = std::str::from_utf8(&self.buf_shared[rng.start..rng.end])
1259 .map_err(Error::Utf8Error)?;
1260 res = Some(s);
1261 break;
1262 }
1263 }
1264 Ok(res)
1265 }
1266
1267 /// iterate an integer for each filter key.
1268 /// If the length of the iterator is 0, it means no filter label is set.
1269 /// Example:
1270 /// ```
1271 /// use bcf_reader::*;
1272 /// use std::io::Write;
1273 /// // read data generated by bcftools
1274 /// // bcftools query -f '%FILTER\n' testdata/test2.bcf | bgzip -c > testdata/test2_filters.gz
1275 /// let mut filter_str = String::new();
1276 /// smart_reader("testdata/test2_filters.gz")
1277 /// .unwrap()
1278 /// .read_to_string(&mut filter_str)
1279 /// .unwrap();
1280 /// // read data via bcf-reader
1281 /// let mut f = smart_reader("testdata/test2.bcf").unwrap();
1282 /// let s = read_header(&mut f).unwrap();
1283 /// let header = Header::from_string(&s).unwrap();
1284 /// let mut record = Record::default();
1285 /// let mut filter_str2 = Vec::<u8>::new();
1286 /// let d = header.dict_strings();
1287 /// while let Ok(_) = record.read(&mut f) {
1288 /// record.filters().for_each(|nv| {
1289 /// let nv = nv.unwrap();
1290 /// let filter_key = nv.int_val().unwrap() as usize;
1291 /// let dict_string_map = &d[&filter_key];
1292 /// let filter_name = &dict_string_map["ID"];
1293 /// write!(filter_str2, "{filter_name};").unwrap();
1294 /// });
1295 /// *filter_str2.last_mut().unwrap() = b'\n'; // line separators
1296 /// }
1297 /// let filter_str2 = String::from_utf8(filter_str2).unwrap();
1298 /// // compare bcftools results and bcf-reader results
1299 /// assert_eq!(filter_str, filter_str2);
1300 /// ```
1301 pub fn filters(&self) -> NumericValueIter {
1302 let (typ, n, rng) = &self.filters;
1303 NumericValueIter {
1304 reader: std::io::Cursor::new(&self.buf_shared[rng.start..rng.end]),
1305 typ: *typ,
1306 len: *n,
1307 cur: 0,
1308 }
1309 }
1310
1311 /// Returns the buffer containing indv (sample-level) information
1312 pub fn buf_indiv(&self) -> &[u8] {
1313 &self.buf_indiv[..]
1314 }
1315
1316 /// Returns the buffer containing the shared (site-level) information
1317 pub fn buf_shared(&self) -> &[u8] {
1318 &self.buf_shared[..]
1319 }
1320}
1321
1322/// Open a file from a path as a MultiGzDecoder or a BufReader depending on
1323/// whether the file has the magic number for gzip (0x1f and 0x8b)
1324pub fn smart_reader(p: impl AsRef<std::path::Path>) -> std::io::Result<Box<dyn std::io::Read>> {
1325 let mut f = std::fs::File::open(p.as_ref())?;
1326 if (f.read_u8()? == 0x1fu8) && (f.read_u8()? == 0x8bu8)
1327 //.expect("can not read second byte")
1328 {
1329 // gzip format
1330 f.rewind()?;
1331 Ok(Box::new(flate2::read::MultiGzDecoder::new(f)))
1332 } else {
1333 // not gzip format
1334 f.rewind()?;
1335 Ok(Box::new(std::io::BufReader::new(f)))
1336 }
1337}
1338
1339/// This reader facilitates parallel decompression of BCF data compressed in
1340/// the BGZF format—a specialized version of the multi-member gzip file format.
1341/// It utilizes internal buffers to sequentially ingest compressed data from
1342/// various gzip blocks, leveraging the `rayon` crate to achieve concurrent
1343/// decompression. This design addresses the potential bottleneck in data
1344/// processing speed that occurs when decompression is not executed in parallel,
1345/// ensuring more efficient handling of compressed data streams.
1346/// Example:
1347/// ```
1348/// use bcf_reader::*;
1349/// use std::fs::File;
1350/// use std::io::BufReader;
1351/// use std::io::Write;
1352/// // read data generated by bcftools
1353/// // bcftools query -f '[\t%GT]\n' test.bcf | bgzip -c > test_gt.gz
1354/// let mut gt_str = String::new();
1355/// smart_reader("testdata/test_gt.gz")
1356/// .unwrap()
1357/// .read_to_string(&mut gt_str)
1358/// .unwrap();
1359/// // read data via bcf-reader
1360/// let max_gzip_block_in_buffer = 10;
1361/// let reader = File::open("testdata/test.bcf").map(BufReader::new).unwrap();
1362/// let mut f =
1363/// ParMultiGzipReader::from_reader(reader, max_gzip_block_in_buffer, None, None).unwrap();
1364/// let s = read_header(&mut f).unwrap();
1365/// let header = Header::from_string(&s).unwrap();
1366/// let mut record = Record::default();
1367/// let mut gt_str2 = Vec::<u8>::new();
1368/// while let Ok(_) = record.read(&mut f) {
1369/// for (i, bn) in record.fmt_gt(&header).enumerate() {
1370/// let bn = bn.unwrap();
1371/// let (noploidy, dot, phased, allele) = bn.gt_val();
1372/// assert_eq!(noploidy, false); // missing ploidy
1373/// let mut sep = '\t';
1374/// if i % 2 == 1 {
1375/// if phased {
1376/// sep = '|';
1377/// } else {
1378/// sep = '/';
1379/// }
1380/// }
1381/// if dot {
1382/// write!(gt_str2, "{sep}.").unwrap();
1383/// } else {
1384/// write!(gt_str2, "{sep}{allele}").unwrap();
1385/// }
1386/// }
1387/// write!(gt_str2, "\n").unwrap();
1388/// }
1389/// let gt_str2 = String::from_utf8(gt_str2).unwrap();
1390/// // compare bcftools results and bcf-reader results
1391/// for (a, b) in gt_str
1392/// .split(|c| (c == '\n') || (c == '\t'))
1393/// .zip(gt_str2.split(|c| (c == '\n') || (c == '\t')))
1394/// {
1395/// assert_eq!(a, b);
1396/// }
1397/// ```
1398///
1399/// See [`ParMultiGzipReader::from_reader`] for an example to jump to a target
1400/// genome interval.
pub struct ParMultiGzipReader<R>
where
    R: Read,
{
    /// Underlying source of BGZF-compressed bytes.
    inner: R,
    // vector of pairs of vector; For each pair one is for compressed data, the other for uncompressed
    buffer: Vec<BgzfBuffer>,
    ngzip: usize, // number gzip blocks read into buffer
    igzip: usize, // the current block to be consumed
    ibyte: usize, // the current byte to be consumed
    /// Compressed-stream offset of the next gzip block to read from `inner`.
    coffset: u64,
    /// Set once `inner` reports EOF; no further blocks will be read.
    inner_eof: bool,
}
1414
/// Buffers for a single BGZF (gzip) block: its compressed payload, the
/// corresponding decompressed bytes, and bookkeeping offsets/sizes.
#[derive(Default, Clone)]
struct BgzfBuffer {
    compressed: Vec<u8>,   // raw DEFLATE payload of the block
    uncompressed: Vec<u8>, // decompressed data, sized to uncompressed_data_size
    coffset: u64,          // inclusive offset of this block in the compressed stream
    gzip_size: u16,        // total size of the gzip block (BSIZE + 1)
    uncompressed_data_size: u32, // ISIZE field of the block
}
1423
1424impl<R> ParMultiGzipReader<R>
1425where
1426 R: Read,
1427{
1428 /// Constructs a new `ParMultiGzipReader` by specifying the `ngzip_max` parameter,
1429 /// which defines the maximum number of gzip blocks that the internal buffers can
1430 /// handle simultaneously. This parameter should ideally be set to the number of
1431 /// CPU cores available to optimize parallel decompression performance, thereby
1432 /// leveraging the hardware's concurrency capabilities.
1433 ///
1434 /// The `coffset` parameter indicates the offset to the first byte of a
1435 /// target gzip block of the input reader. The input reader should point to
1436 /// the start byte of a gzip block as indicated by `coffset` before passing
1437 /// the reader to `ParMultiGzipReader::from_reader`; otherwise,
1438 /// [`Seek::seek`] should be used on the input reader to adjust the position
1439 /// accordingly. Note that `ParMultiGzipReader` does not call [`Seek::seek`]
1440 /// on the reader.
1441 ///
1442 /// The `uoffset` parameter specifies the number of bytes to skip within the
1443 /// first decompressed gzip data. Since skipping within uncompressed data
1444 /// requires decompression, this offset is applied within the
1445 /// `ParMultiGzipReader::from_reader` method.
1446 ///
1447 /// # Examples
1448 /// ```
1449 /// use bcf_reader::*;
1450 /// use std::{
1451 /// fs::File,
1452 /// io::{BufReader, Seek},
1453 /// };
1454 /// // index file
1455 /// let csi = Csi::from_path("testdata/test3.bcf.csi").unwrap();
1456 /// // reader
1457 /// let mut reader = File::open("testdata/test3.bcf")
1458 /// .map(BufReader::new)
1459 /// .unwrap();
1460 ///
    /// // calculate first offsets of the target position
1462 /// let start = 1495403 - 1;
1463 /// let end = 1495746 - 1;
1464 /// let chrom_id = 0;
1465 /// let bin_id = csi.get_bin_id(start, start + 1 as i64);
1466 /// let bin_details = csi.get_bin_details(chrom_id, bin_id).unwrap();
1467 /// let (coffset, uoffset) = bin_details.chunks()[0].chunk_beg.get_coffset_uoffset();
1468 ///
1469 /// // seek to the target bgzip block
1470 /// reader.seek(std::io::SeekFrom::Start(coffset)).unwrap();
    /// // create the parallelizable reader by wrapping around the existing reader
    /// // and specifying offsets
1473 /// let mut reader =
1474 /// ParMultiGzipReader::from_reader(reader, 1, Some(coffset), Some(uoffset)).unwrap();
1475 ///
1476 /// let mut record = Record::default();
1477 /// let mut pos_found = vec![];
1478 /// while let Ok(_) = record.read(&mut reader) {
1479 /// let pos = record.pos() as i64;
1480 /// // the bin containing the start position of target interval may have records
1481 /// // before the start position, so skip them.
1482 /// if pos < start {
1483 /// continue;
1484 /// }
1485 /// // read the record until out of the target interval
1486 /// else if pos >= end {
1487 /// break;
1488 /// }
1489 /// pos_found.push(pos);
1490 /// }
1491 /// assert_eq!(pos_found, vec![start]);
1492 /// ```
1493 ///
1494 /// # Parameters
1495 /// - `reader`: The input reader from which gzip blocks will be read.
1496 /// - `ngzip_max`: The maximum number of gzip blocks that can be processed in parallel.
1497 /// - `coffset`: An optional offset to the start of a gzip block in the input reader.
1498 /// - `uoffset`: An optional offset within the first block of decompressed data.
1499 ///
1500 /// # Returns
1501 /// Returns a new instance of `ParMultiGzipReader`.
    pub fn from_reader(
        reader: R,
        ngzip_max: usize,
        coffset: Option<u64>,
        uoffset: Option<u64>,
    ) -> Result<Self> {
        let mut this = Self {
            inner: reader,
            buffer: vec![BgzfBuffer::default(); ngzip_max],
            ngzip: 0,
            igzip: 0,
            ibyte: 0,
            // `coffset` must match the reader's current position (see doc above).
            coffset: coffset.unwrap_or(0),
            inner_eof: false,
        };
        // Eagerly read and decompress the first batch of gzip blocks so that
        // `uoffset` can be applied within already-decompressed data.
        this.clear_and_fill_buffers()?;
        this.decomp_all()?;
        // Skip `uoffset` bytes inside the first decompressed block.
        this.ibyte = uoffset.unwrap_or(0) as usize;
        Ok(this)
    }
    /// Returns the current virtual position as a `(coffset, uoffset)` pair:
    /// the compressed-stream offset of the gzip block being consumed and the
    /// byte offset within its decompressed data.
    pub fn get_coffset_uoffset(&self) -> (u64, u64) {
        let coffset = self.buffer[self.igzip].coffset;
        let uoffset = self.ibyte as u64;
        (coffset, uoffset)
    }
1527 fn read_single_gzip(&mut self) -> io::Result<()> {
1528 let this_buffer_offset = if self.ngzip == 0 {
1529 self.coffset
1530 } else {
1531 let prev_buffer = &self.buffer[self.ngzip - 1];
1532 prev_buffer.coffset + prev_buffer.gzip_size as u64
1533 };
1534 let this_buffer = &mut self.buffer[self.ngzip];
1535
1536 // dbg!(this_buffer_offset);
1537
1538 // let coffset_beg = self.inner.stream_position().unwrap();
1539 // dbg!(coffset_beg);
1540 let id1 = match self.inner.read_u8() {
1541 Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => {
1542 self.inner_eof = true;
1543 return Ok(());
1544 }
1545 Err(e) => {
1546 return Err(e);
1547 }
1548 Ok(id1) => id1,
1549 };
1550 if id1 != 31 {
1551 Err(ParseGzipHeaderError {
1552 key: "id1",
1553 expected: 31,
1554 found: id1 as usize,
1555 })?;
1556 }
1557
1558 let id2 = self.inner.read_u8()?;
1559 if id2 != 139 {
1560 Err(ParseGzipHeaderError {
1561 key: "id2",
1562 expected: 139,
1563 found: id2 as usize,
1564 })?;
1565 }
1566 // assert_eq!(id2, 139);
1567 let cm = self.inner.read_u8()?;
1568 if cm != 8 {
1569 Err(ParseGzipHeaderError {
1570 key: "cm",
1571 expected: 8,
1572 found: cm as usize,
1573 })?;
1574 }
1575 // assert_eq!(cm, 8);
1576 let flg = self.inner.read_u8()?;
1577 if flg != 4 {
1578 Err(ParseGzipHeaderError {
1579 key: "flg",
1580 expected: 4,
1581 found: flg as usize,
1582 })?;
1583 }
1584 // assert_eq!(flg, 4);
1585 let _mtime = self.inner.read_u32::<LittleEndian>()?;
1586 let _xfl = self.inner.read_u8()?;
1587 let _os = self.inner.read_u8()?;
1588 let xlen = self.inner.read_u16::<LittleEndian>()?;
1589 let si1 = self.inner.read_u8()?;
1590 assert_eq!(si1, 66);
1591 let si2 = self.inner.read_u8()?;
1592 assert_eq!(si2, 67);
1593 let slen = self.inner.read_u16::<LittleEndian>()?;
1594 assert_eq!(slen, 2);
1595 let bsize = self.inner.read_u16::<LittleEndian>()?;
1596
1597 let buffer_compressed = &mut this_buffer.compressed;
1598 let cdata_sz = bsize - xlen - 19;
1599
1600 buffer_compressed.resize(cdata_sz as usize, 0u8);
1601 self.inner.read_exact(buffer_compressed.as_mut_slice())?;
1602
1603 let _crc32 = self.inner.read_u32::<LittleEndian>()?;
1604 let isize = self.inner.read_u32::<LittleEndian>()?;
1605
1606 let buffer_uncompressed = &mut this_buffer.uncompressed;
1607 buffer_uncompressed.resize(isize as usize, 0u8);
1608 this_buffer.coffset = this_buffer_offset;
1609 this_buffer.gzip_size = bsize + 1;
1610 this_buffer.uncompressed_data_size = isize;
1611
1612 // increment counter
1613 self.ngzip += 1;
1614
1615 Ok(())
1616 }
    /// Clears all block buffers and sequentially refills them from `inner`,
    /// reading up to `buffer.len()` gzip blocks (fewer at EOF). Resets the
    /// consumption cursors (`igzip`, `ibyte`) to the start of the new batch.
    fn clear_and_fill_buffers(&mut self) -> std::io::Result<()> {
        // Destructure to update several fields without borrow conflicts.
        let Self {
            inner: _,
            buffer,
            coffset,
            ngzip,
            igzip,
            ibyte,
            inner_eof: _,
        } = self;

        // update coffset for the buffer vector based on last used buffer
        if *ngzip > 0 {
            let last_buffer = &buffer[*ngzip - 1];
            *coffset = last_buffer.coffset + last_buffer.gzip_size as u64;
        }

        // Reset every per-block buffer before reuse.
        buffer.iter_mut().for_each(|bgzf_buffer| {
            bgzf_buffer.compressed.clear();
            bgzf_buffer.uncompressed.clear();
            bgzf_buffer.coffset = 0;
            bgzf_buffer.gzip_size = 0;
            bgzf_buffer.uncompressed_data_size = 0;
        });
        *ngzip = 0;
        *igzip = 0;
        *ibyte = 0;
        // Fill sequentially; stop early when the source runs out of blocks.
        for _i in 0..self.buffer.len() {
            self.read_single_gzip()?;
            if self.inner_eof {
                break;
            }
        }
        Ok(())
    }
1653
1654 /// decompress all read gzip file in memory (parallel)
1655 fn decomp_all(&mut self) -> std::io::Result<()> {
1656 self.buffer
1657 .par_iter_mut()
1658 .try_for_each(|buffer| -> std::io::Result<()> {
1659 let compressed = buffer.compressed.as_slice();
1660 let uncompressed = &mut buffer.uncompressed.as_mut_slice();
1661 let mut deflater = DeflateDecoder::new(compressed);
1662 deflater.read_exact(uncompressed)?;
1663 //.map_err(|e| e.add_context("deflater.read_exact"))?;
1664 Ok(())
1665 })?;
1666 Ok(())
1667 }
1668}
1669
1670impl<R> Read for ParMultiGzipReader<R>
1671where
1672 R: Read,
1673{
1674 fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
1675 // no more data in the buffer
1676 if self.ngzip == self.igzip {
1677 // no more data to read from file
1678 if self.inner_eof {
1679 return Ok(0);
1680 }
1681 self.clear_and_fill_buffers()?;
1682 self.decomp_all()?;
1683 }
1684 // read from the buffer
1685 // check current
1686 let uncompressed = self.buffer[self.igzip].uncompressed.as_slice();
1687 let mut n = uncompressed.len() - self.ibyte;
1688 if n > buf.len() {
1689 n = buf.len();
1690 }
1691 // copy data
1692 buf[0..n]
1693 .iter_mut()
1694 .zip(uncompressed[self.ibyte..].iter())
1695 .for_each(|(d, s)| *d = *s);
1696 self.ibyte += n;
1697 if self.ibyte == uncompressed.len() {
1698 self.igzip += 1;
1699 self.ibyte = 0;
1700 if self.ngzip == self.igzip {
1701 // no more data to read from file
1702 if self.inner_eof {
1703 return Ok(0);
1704 }
1705 self.clear_and_fill_buffers()?;
1706 self.decomp_all()?;
1707 }
1708 }
1709 Ok(n)
1710 }
1711}
1712
/// Virtual file offset used to jump to a specific indexed bin within BCF-format
/// genotype data separated into BGZF blocks.
///
/// Packs `coffset` (upper 48 bits) and `uoffset` (lower 16 bits) into a `u64`.
#[derive(Default)]
pub struct VirtualFileOffsets(u64);
1717
impl VirtualFileOffsets {
    /// Get the `(coffset, uoffset)` tuple from the virtual file offset:
    /// the compressed-stream offset of a BGZF block and the byte offset
    /// within that block's decompressed data.
    pub fn get_coffset_uoffset(&self) -> (u64, u64) {
        (self.0 >> 16, self.0 & 0xffff)
    }
}
1724
1725impl From<u64> for VirtualFileOffsets {
1726 /// Convert u64 into `VirtualFileOffsets`
1727 fn from(value: u64) -> Self {
1728 VirtualFileOffsets(value)
1729 }
1730}
1731impl Debug for VirtualFileOffsets {
1732 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1733 let (coffset, uoffset) = self.get_coffset_uoffset();
1734 write!(f, "coffset: {}, uoffset: {}", coffset, uoffset)
1735 }
1736}
1737
/// Per-reference-sequence index data in a CSI file: the list of bins for one
/// chromosome.
#[derive(Default, Debug)]
struct CsiIndex {
    n_bin: i32,        // number of bins declared in the file for this reference
    bins: Vec<CsiBin>, // bins; sorted by bin number after parsing
}
1743
/// A chunk within a bin in the CSI data structure
#[derive(Default, Debug)]
pub struct CsiChunk {
    /// Virtual file offset of the start of the chunk.
    pub chunk_beg: VirtualFileOffsets,
    /// Virtual file offset of the end of the chunk.
    pub chunk_end: VirtualFileOffsets,
}
1750
/// A bin in the CSI data structure
#[derive(Default, Debug)]
pub struct CsiBin {
    bin: u32,                     // bin number
    _loffset: VirtualFileOffsets, // loffset field (parsed but unused)
    n_chunk: i32,                 // number of chunks declared in the file
    chunks: Vec<CsiChunk>,        // the chunks belonging to this bin
}
1759
1760impl CsiBin {
1761 /// return a slice of chunks with a bin in the CSI data structure
1762 pub fn chunks(&self) -> &[CsiChunk] {
1763 &self.chunks[..]
1764 }
1765}
1766
/// A struct representing CSI index file content
#[derive(Default, Debug)]
pub struct Csi {
    magic: [u8; 4],         // file magic; must be "CSI" + version byte 1
    min_shift: i32,         // number of bits for the minimal interval (per CSI spec)
    depth: i32,             // depth of the binning scheme
    l_aux: i32,             // length of the auxiliary data block
    aux: Vec<u8>,           // auxiliary data (format-specific, kept verbatim)
    n_ref: i32,             // number of reference sequences
    indices: Vec<CsiIndex>, // one index per reference sequence
    n_no_coor: Option<u64>, // optional trailing count of unplaced records
}
1779
1780impl Csi {
1781 /// Create Csi from a path to a `*.csi` file
1782 pub fn from_path(p: impl AsRef<Path>) -> Result<Self> {
1783 let mut csi = Csi::default();
1784 let mut file = smart_reader(p.as_ref())?;
1785 // magic
1786 file.read_exact(csi.magic.as_mut())
1787 .map_err(|e| e.add_context("error in reading csi magic bytes"))?;
1788 if csi.magic != [b'C', b'S', b'I', 1] {
1789 return Err(Error::Io(std::io::Error::new(
1790 io::ErrorKind::Other,
1791 "csi magic is not 'CSI1'".to_owned(),
1792 )));
1793 }
1794 // min_shift
1795 csi.min_shift = file
1796 .read_i32::<LittleEndian>()
1797 .map_err(|e| e.add_context("error in reading csi min_shift field"))?;
1798 // dbg!(csi.min_shift);
1799 // depth
1800 csi.depth = file
1801 .read_i32::<LittleEndian>()
1802 .map_err(|e| e.add_context("error in reading csi depth field"))?;
1803 // dbg!(csi.depth);
1804 // l_aux
1805 csi.l_aux = file
1806 .read_i32::<LittleEndian>()
1807 .map_err(|e| e.add_context("error in reading csi l_aux field"))?;
1808 // dbg!(csi.l_aux);
1809 // aux
1810 csi.aux.resize(csi.l_aux as usize, 0u8);
1811 file.read_exact(csi.aux.as_mut())
1812 .map_err(|e| e.add_context("error in reading csi aux field"))?;
1813 // n_ref
1814 csi.n_ref = file
1815 .read_i32::<LittleEndian>()
1816 .map_err(|e| e.add_context("error in reading csi n_ref field"))?;
1817
1818 // iterate over chromosomes
1819 for _ in 0..csi.n_ref {
1820 let mut idx = CsiIndex {
1821 n_bin: {
1822 file.read_i32::<LittleEndian>()
1823 .map_err(|e| e.add_context("error in reading csi index n_bin field"))?
1824 },
1825 ..Default::default()
1826 };
1827 for _ in 0..idx.n_bin {
1828 let mut bin = CsiBin {
1829 // bin
1830 bin: file
1831 .read_u32::<LittleEndian>()
1832 .map_err(|e| e.add_context("error in reading csi bin bin field"))?,
1833 _loffset: file
1834 .read_u64::<LittleEndian>()
1835 .map_err(|e| {
1836 Error::from(std::io::Error::new(
1837 e.kind(),
1838 "error in reading csi bin loffset field",
1839 ))
1840 })?
1841 .into(),
1842 // n_chunk
1843 n_chunk: file.read_i32::<LittleEndian>().map_err(|e| {
1844 Error::from(std::io::Error::new(
1845 e.kind(),
1846 "error in reading csi bin n_chunk field",
1847 ))
1848 })?,
1849 ..Default::default()
1850 };
1851
1852 for _ in 0..bin.n_chunk {
1853 let chunk = CsiChunk {
1854 chunk_beg: file
1855 .read_u64::<LittleEndian>()
1856 .map_err(|e| {
1857 Error::from(std::io::Error::new(
1858 e.kind(),
1859 "error in reading csi chunk chunk_beg",
1860 ))
1861 })?
1862 .into(),
1863 chunk_end: file
1864 .read_u64::<LittleEndian>()
1865 .map_err(|e| {
1866 Error::from(std::io::Error::new(
1867 e.kind(),
1868 "error in reading csi chunk chunk_end",
1869 ))
1870 })?
1871 .into(),
1872 };
1873 bin.chunks.push(chunk);
1874 }
1875 idx.bins.push(bin);
1876 }
1877 idx.bins.sort_by_key(|x| x.bin);
1878 csi.indices.push(idx);
1879 }
1880 // n_no_coor
1881 csi.n_no_coor = file.read_u64::<LittleEndian>().ok();
1882
1883 Ok(csi)
1884 }
1885
1886 /// Convert positional coordinate range to a bin number
1887 ///
1888 /// `beg`, `end`` coordinates are 0-based. It is exclusive for end.
1889 /// For some reason, not all length bin are searchable.
1890 /// It is seems that setting `end` to `beg` + 1 works well.
1891 // no sure how it works, but it is from CSIv1.pdf c code
1892 pub fn get_bin_id(&self, beg: i64, end: i64) -> u32 {
1893 let mut l = self.depth;
1894 let end = end - 1;
1895 let mut s = self.min_shift;
1896 let mut t = ((1 << (self.depth * 3)) - 1) / 7;
1897
1898 while l > 0 {
1899 // dbg!(s);
1900 if (beg >> s) == (end >> s) {
1901 return (t + (beg >> s)) as u32;
1902 }
1903 s += 3;
1904 t += 1 << (l * 3);
1905 l += 1;
1906 }
1907
1908 0
1909 }
1910
1911 /// Get CsiBin based the chromosome id and bin number.
1912 ///
1913 /// The return CsiBin can provide details of the included chunks.
1914 pub fn get_bin_details(&self, seqid: usize, bin_id: u32) -> Result<&CsiBin> {
1915 // assert!(bin_id <= self.get_bin_limit());
1916 let bins = &self.indices[seqid].bins;
1917 // dbg!(self.indices.len());
1918 // dbg!(bins.len());
1919 let i = bins
1920 .as_slice()
1921 .binary_search_by(|x| x.bin.cmp(&bin_id))
1922 .map_err(|_| Error::CsBinIndexNotFound)?;
1923 // .expect("bin index not found\n");
1924 Ok(&bins[i])
1925 }
1926
1927 /// Get the max possible bin number in theory. Note, the maximum bin may not
1928 /// be present in the Csi index file.
1929 pub fn get_bin_limit(&self) -> u32 {
1930 (1 << (((self.depth + 1) * 3) - 1)) / 7
1931 }
1932}
1933
1934/// BcfReader suitable for read through the BCF file.
1935///
1936/// This assumes that the source reader is in BCF format and stored as BGZF blocks.
1937///
1938/// # Example
1939/// ```
1940/// use bcf_reader::*;
1941/// use std::{
1942/// fs::File,
1943/// io::{BufReader, Seek},
1944/// };
1945/// // underlying reader can be stdin or a file
1946/// let reader = std::fs::File::open("testdata/test3.bcf")
1947/// .map(BufReader::new)
1948/// .unwrap();
1949/// // 1. for sequential decompression (works for data from stdin or a file)
1950/// let reader = flate2::bufread::MultiGzDecoder::new(reader);
1951/// // 2. for parallel decompression (works for data from stdin or a file)
1952/// // however, when data is from stdin, jump is not supported for now.
1953/// // let reader = ParMultiGzipReader::from_reader(reader, 3, None, None);
1954///
1955/// // create bcf reader
1956/// let mut reader = BcfReader::from_reader(reader);
1957/// // read though header
1958/// let _header = reader.read_header();
1959/// // create a reusable record
1960/// let mut record = Record::default();
1961///
1962/// let mut pos_found = vec![];
1963/// // iterate records from the targeted genome interval
1964/// while let Ok(_) = reader.read_record(&mut record) {
1965/// pos_found.push(record.pos() + 1);
1966/// }
1967///
1968/// assert_eq!(pos_found[0], 72);
1969/// assert_eq!(*pos_found.last().unwrap(), 1498841);
1970/// ```
pub struct BcfReader<R>
where
    R: Read,
{
    /// Source of decompressed BCF bytes.
    inner: R,
    /// Whether `read_header` has completed; guards `read_record`.
    header_parsed: bool,
}
1978
1979impl<R> BcfReader<R>
1980where
1981 R: Read,
1982{
    /// Creates a `BcfReader` wrapping `reader`, which should yield
    /// decompressed BCF bytes — e.g. a `flate2` `MultiGzDecoder` or a
    /// [`ParMultiGzipReader`].
    pub fn from_reader(reader: R) -> Self {
        Self {
            inner: reader,
            header_parsed: false,
        }
    }
1992
    /// Read the header
    ///
    /// Consumes the header bytes from the underlying reader and parses them.
    /// Must complete successfully before `read_record` may be called.
    pub fn read_header(&mut self) -> Result<Header> {
        let header =
            Header::from_string(&read_header(&mut self.inner)?).map_err(Error::ParseHeaderError)?;
        self.header_parsed = true;
        Ok(header)
    }
2000
    /// Read one record. This should be called after the header is read and parsed.
    ///
    /// # Panics
    /// Panics if `read_header` has not been called successfully first.
    pub fn read_record(&mut self, record: &mut Record) -> Result<()> {
        assert!(
            self.header_parsed,
            "header should be parsed before reading records"
        );
        record.read(&mut self.inner)
    }
2010}
2011
/// A genome interval defined by chromosome id, start, and end positions.
///
/// Positions are compared directly against `record.pos` (see
/// `IndexedBcfReader::read_record`): a record is in range when
/// `start <= pos` and, if `end` is set, `pos < end`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct GenomeInterval {
    /// chromosome index used for CSI bin lookups
    pub chrom_id: usize,
    /// start position, inclusive lower bound on `record.pos`
    pub start: i64,
    /// exclusive upper bound on `record.pos`; `None` means no upper bound
    pub end: Option<i64>,
}
2018
2019/// IndexedBcfReader allows random access to a specific genome interval of the
/// BCF file using a CSI index file. It is a wrapper around
2021/// [`ParMultiGzipReader<BufReader<File>>`] to allow parallelizable bgzip
2022/// decompression.
2023///
/// The source should be a BCF file consisting of small BGZF blocks.
2025///
2026/// # Example
2027///
2028/// ```
2029/// use bcf_reader::*;
2030/// // create indexed bcf reader
2031/// let mut reader =
2032/// IndexedBcfReader::from_path("testdata/test3.bcf", "testdata/test3.bcf.csi", None).unwrap();
/// // read through header
2034/// let _header = reader.read_header();
2035/// // define targeted genome interval
2036/// let interval = GenomeInterval {
2037/// chrom_id: 0,
2038/// start: 1489230 - 1,
2039/// end: Some(1498509 - 1),
2040/// };
2041/// // set interval
2042/// reader.set_interval(interval);
2043/// // create a reusable record
2044/// let mut record = Record::default();
2045///
2046/// let mut pos_found = vec![];
2047/// // iterate records from the targeted genome interval
2048/// while let Ok(_) = reader.read_record(&mut record) {
2049/// pos_found.push(record.pos() + 1);
2050/// }
2051///
2052/// assert_eq!(
2053/// pos_found,
2054/// vec![
2055/// 1489230, 1489979, 1490069, 1490311, 1492233, 1492337, 1493505, 1494178, 1495207,
2056/// 1495403, 1495746, 1496047, 1497964, 1498188,
2057/// ]
2058/// )
2059/// ```
pub struct IndexedBcfReader {
    /// parallel bgzip decompressor over the underlying BCF file
    inner: ParMultiGzipReader<BufReader<File>>,
    /// parsed CSI index used to locate the file offset for a genome interval
    csi: Csi,
    /// true once `read_header` has completed successfully
    header_parsed: bool,
    /// target interval set via `set_interval`; `read_record` returns an
    /// error while this is `None`
    genome_interval: Option<GenomeInterval>,
}
2066
impl IndexedBcfReader {
    /// Create an IndexedBcfReader from paths to a bcf file and a corresponding
    /// csi index file.
    ///
    /// - `max_gzip`, the number of gzip blocks to read before each batch of
    ///   parallelized decompression. See [`ParMultiGzipReader::from_reader`]
    ///   (by default (None) use 3).
    ///
    /// Note: this constructor does NOT read or parse the header; call
    /// [`Self::read_header`] before reading records.
    pub fn from_path(
        path_bcf: impl AsRef<Path>,
        path_csi: impl AsRef<Path>,
        max_gzip: Option<usize>,
    ) -> Result<Self> {
        let reader = File::open(path_bcf.as_ref()).map(BufReader::new)?;
        let csi = Csi::from_path(path_csi.as_ref())?;
        let reader = ParMultiGzipReader::from_reader(reader, max_gzip.unwrap_or(3), None, None)?;
        Ok(Self {
            inner: reader,
            csi,
            header_parsed: false,
            genome_interval: None,
        })
    }
    /// Read the header bytes, parse them and return a `Header`
    pub fn read_header(&mut self) -> Result<Header> {
        let header =
            Header::from_string(&read_header(&mut self.inner)?).map_err(Error::ParseHeaderError)?;
        self.header_parsed = true;
        Ok(header)
    }

    /// Jump the file pointer to the beginning of the targeted genome interval.
    ///
    /// If there is no site within the genome interval, `read_record` will
    /// return `Err(_)`.
    pub fn set_interval(&mut self, genome_interval: GenomeInterval) -> Result<()> {
        // locate the CSI bin covering the single position `start`
        let bin_id = self
            .csi
            .get_bin_id(genome_interval.start, genome_interval.start + 1);

        // virtual file offset of the bin's first chunk:
        // coffset = offset of the bgzf block within the compressed file,
        // uoffset = byte offset within that block's uncompressed data
        let (coffset, uoffset) = self
            .csi
            .get_bin_details(genome_interval.chrom_id, bin_id)?
            .chunks[0]
            .chunk_beg
            .get_coffset_uoffset();

        let par_reader = &mut self.inner;
        par_reader.inner.seek(io::SeekFrom::Start(coffset))?;

        // clear buffer, especially things related to coffset, so that no
        // stale decompressed data from before the seek can be served
        par_reader.buffer.iter_mut().for_each(|bgzf_buffer| {
            bgzf_buffer.compressed.clear();
            bgzf_buffer.uncompressed.clear();
            bgzf_buffer.coffset = 0;
            bgzf_buffer.gzip_size = 0;
            bgzf_buffer.uncompressed_data_size = 0;
        });
        par_reader.ngzip = 0;
        par_reader.igzip = 0;
        par_reader.ibyte = 0;

        // refill and decompress starting from the new compressed offset
        par_reader.clear_and_fill_buffers()?;
        par_reader.decomp_all()?;

        // skip `uoffset` bytes into the first decompressed block
        par_reader.ibyte = uoffset as usize;

        self.genome_interval = Some(genome_interval);
        Ok(())
    }

    /// Read one record. Should be called after the header is parsed and after
    /// [`Self::set_interval`] has been called; otherwise an error is returned
    /// (`HeaderNotParsed` / `IndexBcfReaderMissingGenomeInterval`).
    ///
    /// Records positioned before the interval start are skipped; once a record
    /// at or beyond the interval end is read, an error is returned.
    ///
    /// NOTE(review): only positions are compared here, not the chromosome id —
    /// presumably the CSI-based seek already confines reading to the right
    /// contig; verify behavior for intervals near contig boundaries.
    pub fn read_record(&mut self, record: &mut Record) -> Result<()> {
        if !self.header_parsed {
            return Err(Error::HeaderNotParsed);
        }

        let interval = self
            .genome_interval
            .as_ref()
            .ok_or(Error::IndexBcfReaderMissingGenomeInterval)?;
        let start = interval.start;
        let end = interval.end;
        loop {
            match record.read(&mut self.inner) {
                Ok(_) => {
                    // at or past the (exclusive) end -> signal via error
                    if let Some(end) = end {
                        if record.pos as i64 >= end {
                            let e =
                                std::io::Error::new(std::io::ErrorKind::NotFound, "out of range");
                            Err(e)?;
                        }
                    }
                    // at or after start -> record is within the interval
                    if record.pos as i64 >= start {
                        return Ok(());
                    }
                    // before start -> keep scanning forward
                }
                Err(e) => return Err(e),
            }
        }
    }
}