binseq/bq/
mod.rs

1//! # bq
2//!
3//! *.bq files are BINSEQ variants for **fixed-length** records and **do not support quality scores**.
4//!
5//! For variable-length records and optional quality scores use the [`cbq`](crate::cbq) or [`vbq`](crate::vbq) modules.
6//!
7//! This module contains the utilities for reading, writing, and interacting with BQ files.
8//!
9//! For detailed information on the file format, see our [paper](https://www.biorxiv.org/content/10.1101/2025.04.08.647863v1).
10//!
11//! ## Usage
12//!
13//! ### Reading
14//! ```rust
15//! use binseq::{bq, BinseqRecord};
16//! use rand::{thread_rng, Rng};
17//!
18//! let path = "./data/subset.bq";
19//! let reader = bq::MmapReader::new(path).unwrap();
20//!
21//! // We can easily determine the number of records in the file
22//! let num_records = reader.num_records();
23//!
24//! // We have random access to any record within the range
25//! let random_index = thread_rng().gen_range(0..num_records);
26//! let record = reader.get(random_index).unwrap();
27//!
28//! // We can easily decode the 2-bit encoded sequence back to a sequence of bytes
29//! let mut sbuf = Vec::new();
30//! let mut xbuf = Vec::new();
31//!
32//! record.decode_s(&mut sbuf);
33//! if record.is_paired() {
34//!     record.decode_x(&mut xbuf);
35//! }
36//! ```
37//!
38//! ### Writing
39//!
40//! #### Writing unpaired sequences
41//!
42//! ```rust
43//! use binseq::{bq, SequencingRecordBuilder};
44//! use std::io::Cursor;
45//!
46//! // Create an in-memory buffer for output
47//! let output_handle = Cursor::new(Vec::new());
48//!
49//! // Initialize our BQ header (64 bp, only primary)
50//! let header = bq::FileHeaderBuilder::new().slen(64).build().unwrap();
51//!
52//! // Initialize our BQ writer
53//! let mut writer = bq::WriterBuilder::default()
54//!     .header(header)
55//!     .build(output_handle)
56//!     .unwrap();
57//!
58//! // Create a fixed 64 bp sequence (all `A`s)
59//! let seq = [b'A'; 64];
60//!
61//! // Build a record and write it to the file
62//! let record = SequencingRecordBuilder::default()
63//!     .s_seq(&seq)
64//!     .flag(0)
65//!     .build()
66//!     .unwrap();
67//! writer.push(record).unwrap();
68//!
69//! // Flush the writer
70//! writer.flush().unwrap();
71//! ```
72//!
73//! #### Writing paired sequences
74//!
75//! ```rust
76//! use binseq::{bq, SequencingRecordBuilder};
77//! use std::io::Cursor;
78//!
79//! // Create an in-memory buffer for output
80//! let output_handle = Cursor::new(Vec::new());
81//!
82//! // Initialize our BQ header (64 bp primary, 128 bp secondary)
83//! let header = bq::FileHeaderBuilder::new().slen(64).xlen(128).build().unwrap();
84//!
85//! // Initialize our BQ writer
86//! let mut writer = bq::WriterBuilder::default()
87//!     .header(header)
88//!     .build(output_handle)
89//!     .unwrap();
90//!
91//! // Generate paired sequences
92//! let primary = [b'A'; 64];
93//! let secondary = [b'C'; 128];
94//!
95//! // Build a paired record and write it to the file
96//! let record = SequencingRecordBuilder::default()
97//!     .s_seq(&primary)
98//!     .x_seq(&secondary)
99//!     .flag(0)
100//!     .build()
101//!     .unwrap();
102//! writer.push(record).unwrap();
103//!
104//! // Flush the writer
105//! writer.flush().unwrap();
106//! ```
107//!
108//! ### Example: Streaming Access
109//!
110//! ```
111//! use binseq::{Policy, Result, BinseqRecord, SequencingRecordBuilder};
112//! use binseq::bq::{FileHeaderBuilder, StreamReader, StreamWriterBuilder};
113//! use std::io::{BufReader, Cursor};
114//!
115//! fn main() -> Result<()> {
116//!     // Create a header for sequences of length 100
117//!     let header = FileHeaderBuilder::new().slen(100).build()?;
118//!
119//!     // Create a stream writer
120//!     let mut writer = StreamWriterBuilder::default()
121//!         .header(header)
122//!         .buffer_capacity(8192)
123//!         .build(Cursor::new(Vec::new()))?;
124//!
125//!     // Write sequences
126//!     let sequence = b"ACGT".repeat(25); // 100 nucleotides
127//!     let record = SequencingRecordBuilder::default()
128//!         .s_seq(&sequence)
129//!         .flag(0)
130//!         .build()?;
131//!     writer.push(record)?;
132//!
133//!     // Get the inner buffer
134//!     let buffer = writer.into_inner()?;
135//!     let data = buffer.into_inner();
136//!
137//!     // Create a stream reader
138//!     let mut reader = StreamReader::new(BufReader::new(Cursor::new(data)));
139//!
140//!     // Process records as they arrive
141//!     while let Some(record) = reader.next_record() {
142//!         // Process each record
143//!         let record = record?;
144//!         let flag = record.flag();
145//!     }
146//!
147//!     Ok(())
148//! }
149//! ```
150//!
151//! ## BQ file format
152//!
153//! A BQ file consists of two sections:
154//!
155//! 1. Fixed-size header (32 bytes)
156//! 2. Record data section
157//!
158//! ### Header Format (32 bytes total)
159//!
160//! | Offset | Size (bytes) | Name     | Description                  | Type   |
161//! | ------ | ------------ | -------- | ---------------------------- | ------ |
162//! | 0      | 4            | magic    | Magic number (0x42534551)    | uint32 |
163//! | 4      | 1            | format   | Format version (currently 2) | uint8  |
164//! | 5      | 4            | slen     | Sequence length (primary)    | uint32 |
165//! | 9      | 4            | xlen     | Sequence length (secondary)  | uint32 |
166//! | 13     | 19           | reserved | Reserved for future use      | bytes  |
167//!
168//! ### Record Format
169//!
170//! Each record consists of:
171//!
172//! 1. Flag field (8 bytes, uint64)
173//! 2. Sequence data (ceil(N/32) \* 8 bytes, where N is sequence length)
174//!
175//! The flag field is implementation-defined and can be used for filtering, metadata, or other purposes. The placement of the flag field at the start of each record enables efficient filtering without reading sequence data.
176//!
177//! Total record size = 8 + (ceil(N/32) \* 8) bytes, where N is sequence length
178//!
179//! ## Encoding
180//!
181//! - Each nucleotide is encoded using 2 bits:
182//!   - A = 00
183//!   - C = 01
184//!   - G = 10
185//!   - T = 11
186//! - Non-ATCG characters are **unsupported**.
187//! - Sequences are stored in Little-Endian order
188//! - The final u64 of sequence data is padded with zeros if the sequence length is not divisible by 32
189//!
190//! See [`bitnuc`] for 2bit implementation details.
191//!
192//! ## BQ Implementation Notes
193//!
194//! - Sequences are stored in u64 chunks, each holding up to 32 bases
195//! - Random access to any record can be calculated as:
196//!   - record_size = 8 + (ceil(sequence_length/32) \* 8)
197//!   - record_start = 32 + (record_index \* record_size)
198//! - Total number of records can be calculated as: (file_size - 32) / record_size
199//! - Flag field placement allows for efficient filtering strategies:
200//!   - Records can be skipped based on flag values without reading sequence data
201//!   - Flag checks can be vectorized for parallel processing
202//!   - Memory access patterns are predictable for better cache utilization
203//!
204//! ## Example Storage Requirements
205//!
206//! Common sequence lengths:
207//!
208//! - 32bp reads:
209//!   - Sequence: 1 \* 8 = 8 bytes (fits in one u64)
210//!   - Flag: 8 bytes
211//!   - Total per record: 16 bytes
212//! - 100bp reads:
213//!   - Sequence: 4 \* 8 = 32 bytes (requires four u64s)
214//!   - Flag: 8 bytes
215//!   - Total per record: 40 bytes
216//! - 150bp reads:
217//!   - Sequence: 5 \* 8 = 40 bytes (requires five u64s)
218//!   - Flag: 8 bytes
219//!   - Total per record: 48 bytes
220//!
221//! ## Validation
222//!
223//! Implementations should verify:
224//!
225//! 1. Correct magic number
226//! 2. Compatible version number
227//! 3. Sequence length is greater than 0
228//! 4. File size minus header (32 bytes) is divisible by the record size
229//!
230//! ## Future Considerations
231//!
232//! - The 19 reserved bytes in the header allow for future format extensions
233//! - The 64-bit flag field provides space for implementation-specific features such as:
234//!   - Quality score summaries
235//!   - Filtering flags
236//!   - Read group identifiers
237//!   - Processing state
238//!   - Count data
239
240mod header;
241mod reader;
242mod writer;
243
244pub use header::{FileHeader, FileHeaderBuilder, SIZE_HEADER};
245pub use reader::{MmapReader, RefRecord, StreamReader};
246pub use writer::{Encoder, StreamWriter, StreamWriterBuilder, Writer, WriterBuilder};