binseq/vbq/header.rs
1//! # File and Block Header Definitions
2//!
3//! This module defines the header structures used in the VBINSEQ file format.
4//!
5//! The VBINSEQ format consists of two primary header types:
6//!
7//! 1. `VBinseqHeader` - The file header that appears at the beginning of a VBINSEQ file,
8//! containing information about the overall file format and configuration.
9//!
10//! 2. `BlockHeader` - Headers that appear before each block of records, containing
11//! information specific to that block like its size and number of records.
12//!
13//! Both headers are fixed-size and include magic numbers to validate file integrity.
14
15use std::io::{Read, Write};
16
17use bitnuc::BitSize;
18use byteorder::{ByteOrder, LittleEndian};
19
20use crate::error::{HeaderError, ReadError, Result};
21
/// Magic number for file identification: the ASCII bytes "VSEQ" interpreted as a
/// little-endian `u32` (0x5145_5356).
///
/// This constant is used in the file header to identify VBINSEQ formatted files.
/// It is derived from the byte string itself so the numeric value and its ASCII
/// spelling cannot drift apart.
const MAGIC: u32 = u32::from_le_bytes(*b"VSEQ");

/// Magic number for block identification: the ASCII bytes "BLOCKSEQ" interpreted as a
/// little-endian `u64` (0x5145_534B_434F_4C42).
///
/// This constant is used in block headers to validate block integrity.
const BLOCK_MAGIC: u64 = u64::from_le_bytes(*b"BLOCKSEQ");

/// Current format version number
///
/// This should be incremented when making backwards-incompatible changes to the format.
const FORMAT: u8 = 1;

/// Size of the file header in bytes (32 bytes)
///
/// The file header has a fixed size to simplify parsing.
pub const SIZE_HEADER: usize = 32;

/// Size of the block header in bytes (32 bytes)
///
/// Each block header has a fixed size to simplify block navigation.
pub const SIZE_BLOCK_HEADER: usize = 32;

/// Default block size in bytes: 128KB
///
/// This defines the default virtual size of each record block.
/// A larger block size can improve compression ratio but reduces random access granularity.
pub const BLOCK_SIZE: u64 = 128 * 1024;

/// Reserved bytes for future use in the file header (13 bytes)
///
/// These bytes are set to a placeholder value (42) and reserved for future extensions.
pub const RESERVED_BYTES: [u8; 13] = [42; 13];

/// Reserved bytes for future use in block headers (12 bytes)
///
/// These bytes are set to a placeholder value (42) and reserved for future extensions.
pub const RESERVED_BYTES_BLOCK: [u8; 12] = [42; 12];
64
65#[derive(Default, Debug, Clone, Copy)]
66pub struct VBinseqHeaderBuilder {
67 qual: Option<bool>,
68 block: Option<u64>,
69 compressed: Option<bool>,
70 paired: Option<bool>,
71 bitsize: Option<BitSize>,
72 headers: Option<bool>,
73 flags: Option<bool>,
74}
75impl VBinseqHeaderBuilder {
76 #[must_use]
77 pub fn new() -> Self {
78 Self::default()
79 }
80 #[must_use]
81 pub fn qual(mut self, qual: bool) -> Self {
82 self.qual = Some(qual);
83 self
84 }
85 #[must_use]
86 pub fn block(mut self, block: u64) -> Self {
87 self.block = Some(block);
88 self
89 }
90 #[must_use]
91 pub fn compressed(mut self, compressed: bool) -> Self {
92 self.compressed = Some(compressed);
93 self
94 }
95 #[must_use]
96 pub fn paired(mut self, paired: bool) -> Self {
97 self.paired = Some(paired);
98 self
99 }
100 #[must_use]
101 pub fn bitsize(mut self, bitsize: BitSize) -> Self {
102 self.bitsize = Some(bitsize);
103 self
104 }
105 #[must_use]
106 pub fn headers(mut self, headers: bool) -> Self {
107 self.headers = Some(headers);
108 self
109 }
110 #[must_use]
111 pub fn flags(mut self, flags: bool) -> Self {
112 self.flags = Some(flags);
113 self
114 }
115 #[must_use]
116 pub fn build(self) -> VBinseqHeader {
117 VBinseqHeader::with_capacity(
118 self.block.unwrap_or(BLOCK_SIZE),
119 self.qual.unwrap_or(false),
120 self.compressed.unwrap_or(false),
121 self.paired.unwrap_or(false),
122 self.bitsize.unwrap_or_default(),
123 self.headers.unwrap_or(false),
124 self.flags.unwrap_or(false),
125 )
126 }
127}
128
/// File header for VBINSEQ files
///
/// This structure represents the 32-byte header that appears at the beginning of every
/// VBINSEQ file. It contains configuration information about the file format, including
/// whether quality scores are included, whether blocks are compressed, and whether
/// records contain paired sequences.
///
/// # Fields
///
/// Listed in on-disk order; the sizes sum to 32 bytes.
///
/// * `magic` - Magic number to validate file format ("VSEQ", 4 bytes)
/// * `format` - Version number of the file format (1 byte)
/// * `block` - Size of each block in bytes (8 bytes)
/// * `qual` - Whether quality scores are included (1 byte boolean)
/// * `compressed` - Whether blocks are ZSTD compressed (1 byte boolean)
/// * `paired` - Whether records contain paired sequences (1 byte boolean)
/// * `bits` - Number of bits per nucleotide (1 byte)
/// * `headers` - Whether sequence headers are included (1 byte boolean)
/// * `flags` - Whether flags are included (1 byte boolean)
/// * `reserved` - Reserved bytes for future extensions (13 bytes)
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct VBinseqHeader {
    /// Magic number to identify the file format ("VSEQ")
    ///
    /// Always set to 0x51455356 (4 bytes)
    pub magic: u32,

    /// Version of the file format
    ///
    /// Currently set to 1 (1 byte)
    pub format: u8,

    /// Block size in bytes
    ///
    /// This is the virtual (uncompressed) size of each record block (8 bytes)
    pub block: u64,

    /// Whether quality scores are included with sequences
    ///
    /// If true, quality scores are stored for each nucleotide (1 byte)
    pub qual: bool,

    /// Whether internal blocks are compressed with ZSTD
    ///
    /// If true, blocks are compressed individually (1 byte)
    pub compressed: bool,

    /// Whether records contain paired sequences
    ///
    /// If true, each record has both primary and extended sequences (1 byte)
    pub paired: bool,

    /// The bitsize of the sequence data (1 byte)
    ///
    /// Specifies the number of bits per nucleotide:
    /// - 2-bit: Standard encoding (A=00, C=01, G=10, T=11)
    /// - 4-bit: Extended encoding supporting ambiguous nucleotides
    pub bits: BitSize,

    /// Whether sequence headers are included with sequences (1 byte)
    ///
    /// When true, each record includes length-prefixed UTF-8 header strings
    /// for both primary and extended (paired) sequences
    pub headers: bool,

    /// Whether flags are included with records (1 byte)
    ///
    /// NOTE(review): the per-record encoding of flags is defined by the record
    /// reader/writer rather than by this module; confirm the exact layout there.
    pub flags: bool,

    /// Reserved bytes for future format extensions
    ///
    /// Currently filled with placeholder values (13 bytes)
    pub reserved: [u8; 13],
}
201impl Default for VBinseqHeader {
202 /// Creates a default header with default block size and all features disabled
203 ///
204 /// The default header:
205 /// - Uses the default block size (128KB)
206 /// - Does not include quality scores
207 /// - Does not use compression
208 /// - Does not support paired sequences
209 /// - Does not include sequence headers
210 /// - Uses 2-bit nucleotide encoding
211 fn default() -> Self {
212 Self::with_capacity(
213 BLOCK_SIZE,
214 false,
215 false,
216 false,
217 BitSize::default(),
218 false,
219 false,
220 )
221 }
222}
223impl VBinseqHeader {
224 /// Creates a new VBINSEQ header with the default block size
225 ///
226 /// # Parameters
227 ///
228 /// * `qual` - Whether to include quality scores with sequences
229 /// * `compressed` - Whether to use ZSTD compression for blocks
230 /// * `paired` - Whether records contain paired sequences
231 /// * `bitsize` - Number of bits per nucleotide (2 or 4)
232 /// * `headers` - Whether to include sequence headers with records
233 ///
234 /// # Example
235 ///
236 /// ```rust
237 /// use binseq::vbq::VBinseqHeaderBuilder;
238 ///
239 /// // Create header with quality scores and compression, without paired sequences
240 /// let header = VBinseqHeaderBuilder::new()
241 /// .qual(true)
242 /// .compressed(true)
243 /// .build();
244 /// ```
245 #[must_use]
246 pub fn new(
247 qual: bool,
248 compressed: bool,
249 paired: bool,
250 bitsize: BitSize,
251 headers: bool,
252 flags: bool,
253 ) -> Self {
254 Self::with_capacity(
255 BLOCK_SIZE, qual, compressed, paired, bitsize, headers, flags,
256 )
257 }
258
259 /// Creates a new VBINSEQ header with a custom block size
260 ///
261 /// # Parameters
262 ///
263 /// * `block` - Custom block size in bytes (virtual/uncompressed size)
264 /// * `qual` - Whether to include quality scores with sequences
265 /// * `compressed` - Whether to use ZSTD compression for blocks
266 /// * `paired` - Whether records contain paired sequences
267 ///
268 /// # Example
269 ///
270 /// ```rust
271 /// use binseq::vbq::VBinseqHeaderBuilder;
272 ///
273 /// // Create header with a 256KB block size, with quality scores and compression
274 /// let header = VBinseqHeaderBuilder::new()
275 /// .block(256 * 1024)
276 /// .qual(true)
277 /// .compressed(true)
278 /// .build();
279 /// ```
280 #[must_use]
281 pub fn with_capacity(
282 block: u64,
283 qual: bool,
284 compressed: bool,
285 paired: bool,
286 bitsize: BitSize,
287 headers: bool,
288 flags: bool,
289 ) -> Self {
290 Self {
291 magic: MAGIC,
292 format: FORMAT,
293 block,
294 qual,
295 compressed,
296 paired,
297 headers,
298 flags,
299 bits: bitsize,
300 reserved: RESERVED_BYTES,
301 }
302 }
303
304 /// Sets the encoding bitsize for the header.
305 pub fn set_bitsize(&mut self, bits: BitSize) {
306 self.bits = bits;
307 }
308
309 /// Creates a header from a 32-byte buffer
310 ///
311 /// This function parses a raw byte buffer into a `VBinseqHeader` structure,
312 /// validating the magic number and format version.
313 ///
314 /// # Parameters
315 ///
316 /// * `buffer` - A 32-byte array containing the header data
317 ///
318 /// # Returns
319 ///
320 /// * `Result<Self>` - A valid header if parsing was successful
321 ///
322 /// # Errors
323 ///
324 /// * `HeaderError::InvalidMagicNumber` - If the magic number doesn't match "VSEQ"
325 /// * `HeaderError::InvalidFormatVersion` - If the format version is unsupported
326 /// * `HeaderError::InvalidReservedBytes` - If the reserved bytes section is invalid
327 pub fn from_bytes(buffer: &[u8; SIZE_HEADER]) -> Result<Self> {
328 let magic = LittleEndian::read_u32(&buffer[0..4]);
329 if magic != MAGIC {
330 return Err(HeaderError::InvalidMagicNumber(magic).into());
331 }
332 let format = buffer[4];
333 if format != FORMAT {
334 return Err(HeaderError::InvalidFormatVersion(format).into());
335 }
336 let block = LittleEndian::read_u64(&buffer[5..13]);
337 let qual = buffer[13] != 0;
338 let compressed = buffer[14] != 0;
339 let paired = buffer[15] != 0;
340 let bits = match buffer[16] {
341 0 | 2 | 42 => BitSize::Two,
342 4 => BitSize::Four,
343 x => return Err(HeaderError::InvalidBitSize(x).into()),
344 };
345 let headers = match buffer[17] {
346 0 | 42 => false, // backwards compatibility
347 _ => true,
348 };
349 let flags = buffer[18] != 0;
350 let Ok(reserved) = buffer[19..32].try_into() else {
351 return Err(HeaderError::InvalidReservedBytes.into());
352 };
353 Ok(Self {
354 magic,
355 format,
356 block,
357 qual,
358 compressed,
359 paired,
360 bits,
361 headers,
362 flags,
363 reserved,
364 })
365 }
366
367 /// Writes the header to a writer
368 ///
369 /// This function serializes the header structure into a 32-byte buffer and writes
370 /// it to the provided writer.
371 ///
372 /// # Parameters
373 ///
374 /// * `writer` - Any type that implements the `Write` trait
375 ///
376 /// # Returns
377 ///
378 /// * `Result<()>` - Success if the header was written
379 ///
380 /// # Errors
381 ///
382 /// * IO errors if writing to the writer fails
383 pub fn write_bytes<W: Write>(&self, writer: &mut W) -> Result<()> {
384 let mut buffer = [0u8; SIZE_HEADER];
385 LittleEndian::write_u32(&mut buffer[0..4], self.magic);
386 buffer[4] = self.format;
387 LittleEndian::write_u64(&mut buffer[5..13], self.block);
388 buffer[13] = self.qual.into();
389 buffer[14] = self.compressed.into();
390 buffer[15] = self.paired.into();
391 buffer[16] = self.bits.into();
392 buffer[17] = self.headers.into();
393 buffer[18] = self.flags.into();
394 buffer[19..32].copy_from_slice(&self.reserved);
395 writer.write_all(&buffer)?;
396 Ok(())
397 }
398
399 /// Reads a header from a reader
400 ///
401 /// This function reads 32 bytes from the provided reader and parses them into
402 /// a `VBinseqHeader` structure.
403 ///
404 /// # Parameters
405 ///
406 /// * `reader` - Any type that implements the `Read` trait
407 ///
408 /// # Returns
409 ///
410 /// * `Result<Self>` - A valid header if reading and parsing was successful
411 ///
412 /// # Errors
413 ///
414 /// * IO errors if reading from the reader fails
415 /// * Header validation errors from `from_bytes()`
416 pub fn from_reader<R: Read>(reader: &mut R) -> Result<Self> {
417 let mut buffer = [0u8; SIZE_HEADER];
418 reader.read_exact(&mut buffer)?;
419 Self::from_bytes(&buffer)
420 }
421
422 #[must_use]
423 pub fn is_paired(&self) -> bool {
424 self.paired
425 }
426}
427
/// Block header for VBINSEQ block data
///
/// Each block in a VBINSEQ file is preceded by a 32-byte block header that contains
/// information about the block including its size and the number of records it contains.
///
/// Block headers compare by value (`PartialEq`/`Eq`), matching the file header type.
///
/// # Fields
///
/// * `magic` - Magic number to validate block integrity ("BLOCKSEQ", 8 bytes)
/// * `size` - Actual size of the block in bytes (8 bytes)
/// * `records` - Number of records in the block (4 bytes)
/// * `reserved` - Reserved bytes for future extensions (12 bytes)
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct BlockHeader {
    /// Magic number to identify the block ("BLOCKSEQ")
    ///
    /// Always set to 0x5145534B434F4C42 (8 bytes)
    pub magic: u64,

    /// Actual size of the block in bytes
    ///
    /// This can differ from the virtual block size in the file header
    /// when compression is enabled (8 bytes)
    pub size: u64,

    /// Number of records stored in this block
    ///
    /// Used to iterate through records efficiently (4 bytes)
    pub records: u32,

    /// Reserved bytes for future extensions
    ///
    /// Currently filled with placeholder values (12 bytes)
    pub reserved: [u8; 12],
}
462impl BlockHeader {
463 /// Creates a new block header
464 ///
465 /// # Parameters
466 ///
467 /// * `size` - The actual size of the block in bytes (can be compressed size)
468 /// * `records` - The number of records contained in the block
469 ///
470 /// # Example
471 ///
472 /// ```rust
473 /// use binseq::vbq::BlockHeader;
474 ///
475 /// // Create a block header for a block with 1024 bytes and 100 records
476 /// let header = BlockHeader::new(1024, 100);
477 /// ```
478 #[must_use]
479 pub fn new(size: u64, records: u32) -> Self {
480 Self {
481 magic: BLOCK_MAGIC,
482 size,
483 records,
484 reserved: RESERVED_BYTES_BLOCK,
485 }
486 }
487
488 #[must_use]
489 pub fn empty() -> Self {
490 Self {
491 magic: BLOCK_MAGIC,
492 size: 0,
493 records: 0,
494 reserved: RESERVED_BYTES_BLOCK,
495 }
496 }
497
498 #[must_use]
499 pub fn is_empty(&self) -> bool {
500 self.size == 0 && self.records == 0
501 }
502
503 /// Writes the block header to a writer
504 ///
505 /// This function serializes the block header structure into a 32-byte buffer and writes
506 /// it to the provided writer.
507 ///
508 /// # Parameters
509 ///
510 /// * `writer` - Any type that implements the `Write` trait
511 ///
512 /// # Returns
513 ///
514 /// * `Result<()>` - Success if the header was written
515 ///
516 /// # Errors
517 ///
518 /// * IO errors if writing to the writer fails
519 pub fn write_bytes<W: Write>(&self, writer: &mut W) -> Result<()> {
520 let mut buffer = [0u8; SIZE_BLOCK_HEADER];
521 LittleEndian::write_u64(&mut buffer[0..8], self.magic);
522 LittleEndian::write_u64(&mut buffer[8..16], self.size);
523 LittleEndian::write_u32(&mut buffer[16..20], self.records);
524 buffer[20..].copy_from_slice(&self.reserved);
525 writer.write_all(&buffer)?;
526 Ok(())
527 }
528
529 /// Creates a block header from a 32-byte buffer
530 ///
531 /// This function parses a raw byte buffer into a `BlockHeader` structure,
532 /// validating the magic number.
533 ///
534 /// # Parameters
535 ///
536 /// * `buffer` - A 32-byte array containing the block header data
537 ///
538 /// # Returns
539 ///
540 /// * `Result<Self>` - A valid block header if parsing was successful
541 ///
542 /// # Errors
543 ///
544 /// * `ReadError::InvalidBlockMagicNumber` - If the magic number doesn't match "BLOCKSEQ"
545 pub fn from_bytes(buffer: &[u8; SIZE_BLOCK_HEADER]) -> Result<Self> {
546 let magic = LittleEndian::read_u64(&buffer[0..8]);
547 if magic != BLOCK_MAGIC {
548 return Err(ReadError::InvalidBlockMagicNumber(magic, 0).into());
549 }
550 let size = LittleEndian::read_u64(&buffer[8..16]);
551 let records = LittleEndian::read_u32(&buffer[16..20]);
552 Ok(Self::new(size, records))
553 }
554
555 #[must_use]
556 pub fn size_with_header(&self) -> usize {
557 self.size as usize + SIZE_BLOCK_HEADER
558 }
559}