binseq/bq/mod.rs
1//! # bq
2//!
3//! *.bq files are BINSEQ variants for **fixed-length** records and **do not support quality scores**.
4//!
5//! For variable-length records and optional quality scores use the [`vbq`](crate::vbq) module.
6//!
7//! This module contains the utilities for reading, writing, and interacting with BINSEQ files.
8//!
9//! For detailed information on the file format, see our [paper](https://www.biorxiv.org/content/10.1101/2025.04.08.647863v1).
10//!
11//! ## Usage
12//!
13//! ### Reading
14//! ```rust
15//! use binseq::{bq, BinseqRecord};
16//! use rand::{thread_rng, Rng};
17//!
18//! let path = "./data/subset.bq";
19//! let reader = bq::MmapReader::new(path).unwrap();
20//!
21//! // We can easily determine the number of records in the file
22//! let num_records = reader.num_records();
23//!
24//! // We have random access to any record within the range
25//! let random_index = thread_rng().gen_range(0..num_records);
26//! let record = reader.get(random_index).unwrap();
27//!
28//! // We can easily decode the 2-bit encoded sequence back to a sequence of bytes
29//! let mut sbuf = Vec::new();
30//! let mut xbuf = Vec::new();
31//!
32//! record.decode_s(&mut sbuf);
33//! if record.is_paired() {
34//! record.decode_x(&mut xbuf);
35//! }
36//! ```
37//!
38//! ### Writing
39//!
40//! #### Writing unpaired sequences
41//!
42//! ```rust
43//! use binseq::bq;
44//! use std::fs::File;
45//!
46//! // Define a path for the output file
47//! let path = "./data/some_output.bq";
48//!
49//! // Create the file handle
50//! let output_handle = File::create(path).unwrap();
51//!
52//! // Initialize our BINSEQ header (64 bp, only primary)
53//! let header = bq::BinseqHeaderBuilder::new().slen(64).build().unwrap();
54//!
55//! // Initialize our BINSEQ writer
56//! let mut writer = bq::BinseqWriterBuilder::default()
57//! .header(header)
58//! .build(output_handle)
59//! .unwrap();
60//!
61//! // Create an example sequence (64 `A` nucleotides)
62//! let seq = [b'A'; 64];
63//! let flag = 0;
64//!
65//! // Write the sequence to the file
66//! writer.write_record(Some(flag), &seq).unwrap();
67//!
68//! // Flush any buffered data to the file
69//! writer.flush().unwrap();
70//!
71//! // Remove the file created
72//! std::fs::remove_file(path).unwrap();
73//! ```
74//!
75//! #### Writing paired sequences
76//!
77//! ```rust
78//! use binseq::bq;
79//! use std::fs::File;
80//!
81//! // Define a path for the output file
82//! let path = "./data/some_output.bq";
83//!
84//! // Create the file handle
85//! let output_handle = File::create(path).unwrap();
86//!
87//! // Initialize our BINSEQ header (64 bp primary and 128 bp secondary)
88//! let header = bq::BinseqHeaderBuilder::new().slen(64).xlen(128).build().unwrap();
89//!
90//! // Initialize our BINSEQ writer
91//! let mut writer = bq::BinseqWriterBuilder::default()
92//! .header(header)
93//! .build(output_handle)
94//! .unwrap();
95//!
96//! // Create an example pair of sequences (64 `A` and 128 `C` nucleotides)
97//! let primary = [b'A'; 64];
98//! let secondary = [b'C'; 128];
99//! let flag = 0;
100//!
101//! // Write the paired record to the file
102//! writer.write_paired_record(Some(flag), &primary, &secondary).unwrap();
103//!
104//! // Flush any buffered data to the file
105//! writer.flush().unwrap();
106//!
107//! // Remove the file created
108//! std::fs::remove_file(path).unwrap();
109//! ```
110//!
111//! ### Example: Streaming Access
112//!
113//! ```
114//! use binseq::{Policy, Result, BinseqRecord};
115//! use binseq::bq::{BinseqHeaderBuilder, StreamReader, StreamWriterBuilder};
116//! use std::io::{BufReader, Cursor};
117//!
118//! fn main() -> Result<()> {
119//! // Create a header for sequences of length 100
120//! let header = BinseqHeaderBuilder::new().slen(100).build()?;
121//!
122//! // Create a stream writer
123//! let mut writer = StreamWriterBuilder::default()
124//! .header(header)
125//! .buffer_capacity(8192)
126//! .build(Cursor::new(Vec::new()))?;
127//!
128//! // Write sequences
129//! let sequence = b"ACGT".repeat(25); // 100 nucleotides
130//! writer.write_record(Some(0), &sequence)?;
131//!
132//! // Get the inner buffer
133//! let buffer = writer.into_inner()?;
134//! let data = buffer.into_inner();
135//!
136//! // Create a stream reader
137//! let mut reader = StreamReader::new(BufReader::new(Cursor::new(data)));
138//!
139//! // Process records as they arrive
140//! while let Some(record) = reader.next_record() {
141//! // Process each record
142//! let record = record?;
143//! let flag = record.flag();
144//! }
145//!
146//! Ok(())
147//! }
148//! ```
149//!
150//! ## BQ file format
151//!
152//! A BINSEQ file consists of two sections:
153//!
154//! 1. Fixed-size header (32 bytes)
155//! 2. Record data section
156//!
157//! ### Header Format (32 bytes total)
158//!
159//! | Offset | Size (bytes) | Name | Description | Type |
160//! | ------ | ------------ | -------- | ---------------------------- | ------ |
161//! | 0 | 4 | magic | Magic number (0x42534551) | uint32 |
162//! | 4 | 1 | format | Format version (currently 2) | uint8 |
163//! | 5 | 4 | slen | Sequence length (primary) | uint32 |
164//! | 9 | 4 | xlen | Sequence length (secondary) | uint32 |
165//! | 13 | 19 | reserved | Reserved for future use | bytes |
166//!
167//! ### Record Format
168//!
169//! Each record consists of:
170//!
171//! 1. Flag field (8 bytes, uint64)
172//! 2. Sequence data (ceil(N/32) \* 8 bytes, where N is sequence length)
173//!
174//! The flag field is implementation-defined and can be used for filtering, metadata, or other purposes. The placement of the flag field at the start of each record enables efficient filtering without reading sequence data.
175//!
176//! Total record size = 8 + (ceil(N/32) \* 8) bytes, where N is sequence length
177//!
178//! ## Encoding
179//!
180//! - Each nucleotide is encoded using 2 bits:
181//! - A = 00
182//! - C = 01
183//! - G = 10
184//! - T = 11
185//! - Non-ATCG characters are **unsupported**.
186//! - Sequences are stored in Little-Endian order
187//! - The final u64 of sequence data is padded with zeros if the sequence length is not divisible by 32
188//!
189//! See [`bitnuc`] for 2bit implementation details.
190//!
191//! ## BQ Implementation Notes
192//!
193//! - Sequences are stored in u64 chunks, each holding up to 32 bases
194//! - Random access to any record can be calculated as:
195//! - record_size = 8 + (ceil(sequence_length/32) \* 8)
196//! - record_start = 32 + (record_index \* record_size)
197//! - Total number of records can be calculated as: (file_size - 32) / record_size
198//! - Flag field placement allows for efficient filtering strategies:
199//! - Records can be skipped based on flag values without reading sequence data
200//! - Flag checks can be vectorized for parallel processing
201//! - Memory access patterns are predictable for better cache utilization
202//!
203//! ## Example Storage Requirements
204//!
205//! Common sequence lengths:
206//!
207//! - 32bp reads:
208//! - Sequence: 1 \* 8 = 8 bytes (fits in one u64)
209//! - Flag: 8 bytes
210//! - Total per record: 16 bytes
211//! - 100bp reads:
212//! - Sequence: 4 \* 8 = 32 bytes (requires four u64s)
213//! - Flag: 8 bytes
214//! - Total per record: 40 bytes
215//! - 150bp reads:
216//! - Sequence: 5 \* 8 = 40 bytes (requires five u64s)
217//! - Flag: 8 bytes
218//! - Total per record: 48 bytes
219//!
220//! ## Validation
221//!
222//! Implementations should verify:
223//!
224//! 1. Correct magic number
225//! 2. Compatible version number
226//! 3. Sequence length is greater than 0
227//! 4. File size minus header (32 bytes) is divisible by the record size
228//!
229//! ## Future Considerations
230//!
231//! - The 19 reserved bytes in the header allow for future format extensions
232//! - The 64-bit flag field provides space for implementation-specific features such as:
233//! - Quality score summaries
234//! - Filtering flags
235//! - Read group identifiers
236//! - Processing state
237//! - Count data
238
239mod header;
240mod reader;
241mod writer;
242
243pub use header::{BinseqHeader, BinseqHeaderBuilder, SIZE_HEADER};
244pub use reader::{MmapReader, RefRecord, StreamReader};
245pub use writer::{BinseqWriter, BinseqWriterBuilder, Encoder, StreamWriter, StreamWriterBuilder};