binseq/vbq/
header.rs

1//! # File and Block Header Definitions
2//!
3//! This module defines the header structures used in the VBINSEQ file format.
4//!
5//! The VBINSEQ format consists of two primary header types:
6//!
7//! 1. `VBinseqHeader` - The file header that appears at the beginning of a VBINSEQ file,
8//!    containing information about the overall file format and configuration.
9//!
10//! 2. `BlockHeader` - Headers that appear before each block of records, containing
11//!    information specific to that block like its size and number of records.
12//!
13//! Both headers are fixed-size and include magic numbers to validate file integrity.
14
15use std::io::{Read, Write};
16
17use bitnuc::BitSize;
18use byteorder::{ByteOrder, LittleEndian};
19
20use crate::error::{HeaderError, ReadError, Result};
21
/// File-level magic number: the ASCII bytes `VSEQ` interpreted as a little-endian `u32`.
///
/// Numerically equal to `0x5145_5356`. It is stored in the file header so that
/// readers can recognise VBINSEQ formatted files.
const MAGIC: u32 = u32::from_le_bytes(*b"VSEQ");

/// Block-level magic number: the ASCII bytes `BLOCKSEQ` interpreted as a little-endian `u64`.
///
/// Numerically equal to `0x5145_534B_434F_4C42`. It is stored in every block header
/// so that readers can validate block integrity.
const BLOCK_MAGIC: u64 = u64::from_le_bytes(*b"BLOCKSEQ");

/// Version number of the on-disk format.
///
/// Increment this whenever a backwards-incompatible change is made to the format.
const FORMAT: u8 = 1;

/// Length of the file header in bytes (32).
///
/// The file header is fixed-size, which keeps parsing simple.
pub const SIZE_HEADER: usize = 32;

/// Length of a block header in bytes (32).
///
/// Block headers are fixed-size, which keeps block navigation simple.
pub const SIZE_BLOCK_HEADER: usize = 32;

/// Default block size in bytes: 128 KiB.
///
/// This is the default virtual size of each record block. Larger blocks can
/// improve the compression ratio at the cost of coarser random access.
pub const BLOCK_SIZE: u64 = 128 * 1024;

/// Placeholder contents of the reserved region of the file header (13 bytes).
///
/// Every byte is set to the placeholder value 42; the region is kept for
/// future extensions.
pub const RESERVED_BYTES: [u8; 13] = [42; 13];

/// Placeholder contents of the reserved region of a block header (12 bytes).
///
/// Every byte is set to the placeholder value 42; the region is kept for
/// future extensions.
pub const RESERVED_BYTES_BLOCK: [u8; 12] = [42; 12];
64
65#[derive(Default, Debug, Clone, Copy)]
66pub struct VBinseqHeaderBuilder {
67    qual: Option<bool>,
68    block: Option<u64>,
69    compressed: Option<bool>,
70    paired: Option<bool>,
71    bitsize: Option<BitSize>,
72    headers: Option<bool>,
73    flags: Option<bool>,
74}
75impl VBinseqHeaderBuilder {
76    #[must_use]
77    pub fn new() -> Self {
78        Self::default()
79    }
80    #[must_use]
81    pub fn qual(mut self, qual: bool) -> Self {
82        self.qual = Some(qual);
83        self
84    }
85    #[must_use]
86    pub fn block(mut self, block: u64) -> Self {
87        self.block = Some(block);
88        self
89    }
90    #[must_use]
91    pub fn compressed(mut self, compressed: bool) -> Self {
92        self.compressed = Some(compressed);
93        self
94    }
95    #[must_use]
96    pub fn paired(mut self, paired: bool) -> Self {
97        self.paired = Some(paired);
98        self
99    }
100    #[must_use]
101    pub fn bitsize(mut self, bitsize: BitSize) -> Self {
102        self.bitsize = Some(bitsize);
103        self
104    }
105    #[must_use]
106    pub fn headers(mut self, headers: bool) -> Self {
107        self.headers = Some(headers);
108        self
109    }
110    #[must_use]
111    pub fn flags(mut self, flags: bool) -> Self {
112        self.flags = Some(flags);
113        self
114    }
115    #[must_use]
116    pub fn build(self) -> VBinseqHeader {
117        VBinseqHeader::with_capacity(
118            self.block.unwrap_or(BLOCK_SIZE),
119            self.qual.unwrap_or(false),
120            self.compressed.unwrap_or(false),
121            self.paired.unwrap_or(false),
122            self.bitsize.unwrap_or_default(),
123            self.headers.unwrap_or(false),
124            self.flags.unwrap_or(false),
125        )
126    }
127}
128
129/// File header for VBINSEQ files
130///
131/// This structure represents the 32-byte header that appears at the beginning of every
132/// VBINSEQ file. It contains configuration information about the file format, including
133/// whether quality scores are included, whether blocks are compressed, and whether
134/// records contain paired sequences.
135///
136/// # Fields
137///
138/// * `magic` - Magic number to validate file format ("VSEQ", 4 bytes)
139/// * `format` - Version number of the file format (1 byte)
140/// * `block` - Size of each block in bytes (8 bytes)
141/// * `qual` - Whether quality scores are included (1 byte boolean)
142/// * `compressed` - Whether blocks are ZSTD compressed (1 byte boolean)
143/// * `paired` - Whether records contain paired sequences (1 byte boolean)
144/// * `reserved` - Reserved bytes for future extensions (16 bytes)
145#[derive(Clone, Copy, Debug, PartialEq)]
146pub struct VBinseqHeader {
147    /// Magic number to identify the file format ("VSEQ")
148    ///
149    /// Always set to 0x51455356 (4 bytes)
150    pub magic: u32,
151
152    /// Version of the file format
153    ///
154    /// Currently set to 1 (1 byte)
155    pub format: u8,
156
157    /// Block size in bytes
158    ///
159    /// This is the virtual (uncompressed) size of each record block (8 bytes)
160    pub block: u64,
161
162    /// Whether quality scores are included with sequences
163    ///
164    /// If true, quality scores are stored for each nucleotide (1 byte)
165    pub qual: bool,
166
167    /// Whether internal blocks are compressed with ZSTD
168    ///
169    /// If true, blocks are compressed individually (1 byte)
170    pub compressed: bool,
171
172    /// Whether records contain paired sequences
173    ///
174    /// If true, each record has both primary and extended sequences (1 byte)
175    pub paired: bool,
176
177    /// The bitsize of the sequence data (1 byte)
178    ///
179    /// Specifies the number of bits per nucleotide:
180    /// - 2-bit: Standard encoding (A=00, C=01, G=10, T=11)
181    /// - 4-bit: Extended encoding supporting ambiguous nucleotides
182    pub bits: BitSize,
183
184    /// Whether sequence headers are included with sequences (1 byte)
185    ///
186    /// When true, each record includes length-prefixed UTF-8 header strings
187    /// for both primary and extended (paired) sequences
188    pub headers: bool,
189
190    /// Whether flags are included with sequences (1 byte)
191    ///
192    /// When true, each record includes length-prefixed UTF-8 flag strings
193    /// for both primary and extended (paired) sequences
194    pub flags: bool,
195
196    /// Reserved bytes for future format extensions
197    ///
198    /// Currently filled with placeholder values (13 bytes)
199    pub reserved: [u8; 13],
200}
201impl Default for VBinseqHeader {
202    /// Creates a default header with default block size and all features disabled
203    ///
204    /// The default header:
205    /// - Uses the default block size (128KB)
206    /// - Does not include quality scores
207    /// - Does not use compression
208    /// - Does not support paired sequences
209    /// - Does not include sequence headers
210    /// - Uses 2-bit nucleotide encoding
211    fn default() -> Self {
212        Self::with_capacity(
213            BLOCK_SIZE,
214            false,
215            false,
216            false,
217            BitSize::default(),
218            false,
219            false,
220        )
221    }
222}
223impl VBinseqHeader {
224    /// Creates a new VBINSEQ header with the default block size
225    ///
226    /// # Parameters
227    ///
228    /// * `qual` - Whether to include quality scores with sequences
229    /// * `compressed` - Whether to use ZSTD compression for blocks
230    /// * `paired` - Whether records contain paired sequences
231    /// * `bitsize` - Number of bits per nucleotide (2 or 4)
232    /// * `headers` - Whether to include sequence headers with records
233    ///
234    /// # Example
235    ///
236    /// ```rust
237    /// use binseq::vbq::VBinseqHeaderBuilder;
238    ///
239    /// // Create header with quality scores and compression, without paired sequences
240    /// let header = VBinseqHeaderBuilder::new()
241    ///     .qual(true)
242    ///     .compressed(true)
243    ///     .build();
244    /// ```
245    #[must_use]
246    pub fn new(
247        qual: bool,
248        compressed: bool,
249        paired: bool,
250        bitsize: BitSize,
251        headers: bool,
252        flags: bool,
253    ) -> Self {
254        Self::with_capacity(
255            BLOCK_SIZE, qual, compressed, paired, bitsize, headers, flags,
256        )
257    }
258
259    /// Creates a new VBINSEQ header with a custom block size
260    ///
261    /// # Parameters
262    ///
263    /// * `block` - Custom block size in bytes (virtual/uncompressed size)
264    /// * `qual` - Whether to include quality scores with sequences
265    /// * `compressed` - Whether to use ZSTD compression for blocks
266    /// * `paired` - Whether records contain paired sequences
267    ///
268    /// # Example
269    ///
270    /// ```rust
271    /// use binseq::vbq::VBinseqHeaderBuilder;
272    ///
273    /// // Create header with a 256KB block size, with quality scores and compression
274    /// let header = VBinseqHeaderBuilder::new()
275    ///     .block(256 * 1024)
276    ///     .qual(true)
277    ///     .compressed(true)
278    ///     .build();
279    /// ```
280    #[must_use]
281    pub fn with_capacity(
282        block: u64,
283        qual: bool,
284        compressed: bool,
285        paired: bool,
286        bitsize: BitSize,
287        headers: bool,
288        flags: bool,
289    ) -> Self {
290        Self {
291            magic: MAGIC,
292            format: FORMAT,
293            block,
294            qual,
295            compressed,
296            paired,
297            headers,
298            flags,
299            bits: bitsize,
300            reserved: RESERVED_BYTES,
301        }
302    }
303
304    /// Sets the encoding bitsize for the header.
305    pub fn set_bitsize(&mut self, bits: BitSize) {
306        self.bits = bits;
307    }
308
309    /// Creates a header from a 32-byte buffer
310    ///
311    /// This function parses a raw byte buffer into a `VBinseqHeader` structure,
312    /// validating the magic number and format version.
313    ///
314    /// # Parameters
315    ///
316    /// * `buffer` - A 32-byte array containing the header data
317    ///
318    /// # Returns
319    ///
320    /// * `Result<Self>` - A valid header if parsing was successful
321    ///
322    /// # Errors
323    ///
324    /// * `HeaderError::InvalidMagicNumber` - If the magic number doesn't match "VSEQ"
325    /// * `HeaderError::InvalidFormatVersion` - If the format version is unsupported
326    /// * `HeaderError::InvalidReservedBytes` - If the reserved bytes section is invalid
327    pub fn from_bytes(buffer: &[u8; SIZE_HEADER]) -> Result<Self> {
328        let magic = LittleEndian::read_u32(&buffer[0..4]);
329        if magic != MAGIC {
330            return Err(HeaderError::InvalidMagicNumber(magic).into());
331        }
332        let format = buffer[4];
333        if format != FORMAT {
334            return Err(HeaderError::InvalidFormatVersion(format).into());
335        }
336        let block = LittleEndian::read_u64(&buffer[5..13]);
337        let qual = buffer[13] != 0;
338        let compressed = buffer[14] != 0;
339        let paired = buffer[15] != 0;
340        let bits = match buffer[16] {
341            0 | 2 | 42 => BitSize::Two,
342            4 => BitSize::Four,
343            x => return Err(HeaderError::InvalidBitSize(x).into()),
344        };
345        let headers = match buffer[17] {
346            0 | 42 => false, // backwards compatibility
347            _ => true,
348        };
349        let flags = buffer[18] != 0;
350        let Ok(reserved) = buffer[19..32].try_into() else {
351            return Err(HeaderError::InvalidReservedBytes.into());
352        };
353        Ok(Self {
354            magic,
355            format,
356            block,
357            qual,
358            compressed,
359            paired,
360            bits,
361            headers,
362            flags,
363            reserved,
364        })
365    }
366
367    /// Writes the header to a writer
368    ///
369    /// This function serializes the header structure into a 32-byte buffer and writes
370    /// it to the provided writer.
371    ///
372    /// # Parameters
373    ///
374    /// * `writer` - Any type that implements the `Write` trait
375    ///
376    /// # Returns
377    ///
378    /// * `Result<()>` - Success if the header was written
379    ///
380    /// # Errors
381    ///
382    /// * IO errors if writing to the writer fails
383    pub fn write_bytes<W: Write>(&self, writer: &mut W) -> Result<()> {
384        let mut buffer = [0u8; SIZE_HEADER];
385        LittleEndian::write_u32(&mut buffer[0..4], self.magic);
386        buffer[4] = self.format;
387        LittleEndian::write_u64(&mut buffer[5..13], self.block);
388        buffer[13] = self.qual.into();
389        buffer[14] = self.compressed.into();
390        buffer[15] = self.paired.into();
391        buffer[16] = self.bits.into();
392        buffer[17] = self.headers.into();
393        buffer[18] = self.flags.into();
394        buffer[19..32].copy_from_slice(&self.reserved);
395        writer.write_all(&buffer)?;
396        Ok(())
397    }
398
399    /// Reads a header from a reader
400    ///
401    /// This function reads 32 bytes from the provided reader and parses them into
402    /// a `VBinseqHeader` structure.
403    ///
404    /// # Parameters
405    ///
406    /// * `reader` - Any type that implements the `Read` trait
407    ///
408    /// # Returns
409    ///
410    /// * `Result<Self>` - A valid header if reading and parsing was successful
411    ///
412    /// # Errors
413    ///
414    /// * IO errors if reading from the reader fails
415    /// * Header validation errors from `from_bytes()`
416    pub fn from_reader<R: Read>(reader: &mut R) -> Result<Self> {
417        let mut buffer = [0u8; SIZE_HEADER];
418        reader.read_exact(&mut buffer)?;
419        Self::from_bytes(&buffer)
420    }
421
422    #[must_use]
423    pub fn is_paired(&self) -> bool {
424        self.paired
425    }
426}
427
/// Block header for VBINSEQ block data
///
/// Each block in a VBINSEQ file is preceded by a 32-byte block header that contains
/// information about the block including its size and the number of records it contains.
///
/// # Fields
///
/// Listed in on-disk order, with the byte offsets used by `write_bytes`:
///
/// * `magic` - Magic number to validate block integrity ("BLOCKSEQ", 8 bytes, offset 0)
/// * `size` - Actual size of the block in bytes (8 bytes, offset 8)
/// * `records` - Number of records in the block (4 bytes, offset 16)
/// * `reserved` - Reserved bytes for future extensions (12 bytes, offset 20)
///
/// Integers are stored little-endian.
///
/// Like `VBinseqHeader`, block headers can be compared for equality.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct BlockHeader {
    /// Magic number to identify the block ("BLOCKSEQ")
    ///
    /// Always set to 0x5145534B434F4C42 (8 bytes)
    pub magic: u64,

    /// Actual size of the block in bytes
    ///
    /// This can differ from the virtual block size in the file header
    /// when compression is enabled (8 bytes)
    pub size: u64,

    /// Number of records stored in this block
    ///
    /// Used to iterate through records efficiently (4 bytes)
    pub records: u32,

    /// Reserved bytes for future extensions
    ///
    /// Currently filled with placeholder values (12 bytes)
    pub reserved: [u8; 12],
}
462impl BlockHeader {
463    /// Creates a new block header
464    ///
465    /// # Parameters
466    ///
467    /// * `size` - The actual size of the block in bytes (can be compressed size)
468    /// * `records` - The number of records contained in the block
469    ///
470    /// # Example
471    ///
472    /// ```rust
473    /// use binseq::vbq::BlockHeader;
474    ///
475    /// // Create a block header for a block with 1024 bytes and 100 records
476    /// let header = BlockHeader::new(1024, 100);
477    /// ```
478    #[must_use]
479    pub fn new(size: u64, records: u32) -> Self {
480        Self {
481            magic: BLOCK_MAGIC,
482            size,
483            records,
484            reserved: RESERVED_BYTES_BLOCK,
485        }
486    }
487
488    #[must_use]
489    pub fn empty() -> Self {
490        Self {
491            magic: BLOCK_MAGIC,
492            size: 0,
493            records: 0,
494            reserved: RESERVED_BYTES_BLOCK,
495        }
496    }
497
498    #[must_use]
499    pub fn is_empty(&self) -> bool {
500        self.size == 0 && self.records == 0
501    }
502
503    /// Writes the block header to a writer
504    ///
505    /// This function serializes the block header structure into a 32-byte buffer and writes
506    /// it to the provided writer.
507    ///
508    /// # Parameters
509    ///
510    /// * `writer` - Any type that implements the `Write` trait
511    ///
512    /// # Returns
513    ///
514    /// * `Result<()>` - Success if the header was written
515    ///
516    /// # Errors
517    ///
518    /// * IO errors if writing to the writer fails
519    pub fn write_bytes<W: Write>(&self, writer: &mut W) -> Result<()> {
520        let mut buffer = [0u8; SIZE_BLOCK_HEADER];
521        LittleEndian::write_u64(&mut buffer[0..8], self.magic);
522        LittleEndian::write_u64(&mut buffer[8..16], self.size);
523        LittleEndian::write_u32(&mut buffer[16..20], self.records);
524        buffer[20..].copy_from_slice(&self.reserved);
525        writer.write_all(&buffer)?;
526        Ok(())
527    }
528
529    /// Creates a block header from a 32-byte buffer
530    ///
531    /// This function parses a raw byte buffer into a `BlockHeader` structure,
532    /// validating the magic number.
533    ///
534    /// # Parameters
535    ///
536    /// * `buffer` - A 32-byte array containing the block header data
537    ///
538    /// # Returns
539    ///
540    /// * `Result<Self>` - A valid block header if parsing was successful
541    ///
542    /// # Errors
543    ///
544    /// * `ReadError::InvalidBlockMagicNumber` - If the magic number doesn't match "BLOCKSEQ"
545    pub fn from_bytes(buffer: &[u8; SIZE_BLOCK_HEADER]) -> Result<Self> {
546        let magic = LittleEndian::read_u64(&buffer[0..8]);
547        if magic != BLOCK_MAGIC {
548            return Err(ReadError::InvalidBlockMagicNumber(magic, 0).into());
549        }
550        let size = LittleEndian::read_u64(&buffer[8..16]);
551        let records = LittleEndian::read_u32(&buffer[16..20]);
552        Ok(Self::new(size, records))
553    }
554
555    #[must_use]
556    pub fn size_with_header(&self) -> usize {
557        self.size as usize + SIZE_BLOCK_HEADER
558    }
559}