1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#![warn(missing_docs)]

//! **noodles-bgzf** handles the reading and writing of the blocked gzip format (BGZF).
//!
//! While the gzip format is typically a single stream, a BGZF is the concatenation of many gzip
//! streams. Each stream is called a block, with its uncompressed data size being constrained to
//! at most 64 KiB. This multistream gzip allows random access using [`virtual positions`].
//!
//! noodles-bgzf abstracts away the concept of blocks, implementing [`std::io::Read`] for the
//! reader and [`std::io::Write`] for the writer.
//!
//! [`virtual positions`]: VirtualPosition
//!
//! # Examples
//!
//! ## Read an entire BGZF file
//!
//! ```no_run
//! # use std::{fs::File, io::{self, Read}};
//! use noodles_bgzf as bgzf;
//! let mut reader = File::open("data.gz").map(bgzf::Reader::new)?;
//! let mut data = Vec::new();
//! reader.read_to_end(&mut data)?;
//! # Ok::<(), io::Error>(())
//! ```
//!
//! ## Write a BGZF file
//!
//! ```no_run
//! # use std::{fs::File, io::{self, Write}};
//! use noodles_bgzf as bgzf;
//! let mut writer = File::create("data.gz").map(bgzf::Writer::new)?;
//! writer.write_all(b"noodles-bgzf")?;
//! # Ok::<(), io::Error>(())
//! ```

// The async API is only compiled when the `async` feature is enabled.
#[cfg(feature = "async")]
pub mod r#async;

// `block`, `gz`, and `multithreaded_reader` are private modules; the rest are part of the public
// API.
mod block;
mod gz;
pub mod gzi;
pub mod indexed_reader;
mod multithreaded_reader;
pub mod multithreaded_writer;
pub mod reader;
pub mod virtual_position;
pub mod writer;

// Re-export the primary reader, writer, and position types at the crate root. This is the only
// public path to `MultithreadedReader`, whose module is private.
pub use self::{
    indexed_reader::IndexedReader, multithreaded_reader::MultithreadedReader,
    multithreaded_writer::MultithreadedWriter, reader::Reader, virtual_position::VirtualPosition,
    writer::Writer,
};

// Crate-root aliases for the async reader and writer (`async` feature only).
#[cfg(feature = "async")]
pub use self::r#async::{Reader as AsyncReader, Writer as AsyncWriter};

// Crate-internal import; `Block` is not re-exported.
use self::block::Block;

// Width in bytes of the gzip XLEN field.
const GZIP_XLEN_SIZE: usize = 2;

// Sum of the byte widths of SI1 (1), SI2 (1), SLEN (2), and BSIZE (2).
const BGZF_XLEN: usize = 1 + 1 + 2 + 2;

// Largest ISIZE value permitted in a BGZF block.
//
// § 4.1 The BGZF compression format (2021-06-03): "Thus while `ISIZE` is stored as a `uint32_t` as
// per the gzip format, in BGZF it is limited to the range [0, 65536]."
const BGZF_MAX_ISIZE: usize = 65_536;

// Byte length of a BGZF block header: the gzip header size from the `gz` module, plus the XLEN
// field, plus the BGZF extra-field bytes.
pub(crate) const BGZF_HEADER_SIZE: usize = gz::HEADER_SIZE + GZIP_XLEN_SIZE + BGZF_XLEN;

#[cfg(test)]
mod tests {
    use std::io::{self, BufRead, Read, Write};

    use super::*;

    // Writes three chunks, flushing after each of the first two, then checks that reading the
    // finished stream back yields the concatenated input.
    #[test]
    fn test_self() -> io::Result<()> {
        let mut writer = Writer::new(Vec::new());

        for chunk in [&b"noodles"[..], &b"-"[..]] {
            writer.write_all(chunk)?;
            writer.flush()?;
        }

        // The final chunk is intentionally left unflushed; `finish` completes the stream.
        writer.write_all(b"bgzf")?;

        let compressed = writer.finish()?;

        let mut decoded = Vec::new();
        let mut reader = Reader::new(&compressed[..]);
        reader.read_to_end(&mut decoded)?;

        assert_eq!(decoded, b"noodles-bgzf");

        Ok(())
    }

    // Reads the stream back line by line through `BufRead`, recording the virtual position seen
    // before each line, and checks both the lines and the positions.
    #[test]
    fn test_self_buffered() -> io::Result<()> {
        let mut writer = Writer::new(Vec::new());
        writer.write_all(b"noodles\n-\nbgzf\nbuffered")?;
        let compressed = writer.finish()?;

        let mut reader = Reader::new(&compressed[..]);

        let mut lines = Vec::new();
        let mut virtual_positions = Vec::new();

        loop {
            // Capture the position before reading so it marks the start of the line.
            let start = reader.virtual_position();

            let mut line = String::new();

            // A zero-length read signals EOF; the position taken for it is discarded.
            if reader.read_line(&mut line)? == 0 {
                break;
            }

            virtual_positions.push(start);
            lines.push(line);
        }

        let expected_lines = ["noodles\n", "-\n", "bgzf\n", "buffered"];
        assert_eq!(lines, expected_lines);

        // The expected positions pair compressed offset 0 with each line's uncompressed offset.
        let expected_virtual_positions: Vec<VirtualPosition> = [0, 8, 10, 15]
            .iter()
            .map(|&upos| VirtualPosition::try_from((0, upos)).unwrap())
            .collect();
        assert_eq!(virtual_positions, expected_virtual_positions);

        Ok(())
    }
}