Skip to main content

tinyquant_io/codec_file/
reader.rs

1//! Level-2 TQCV corpus file streaming reader (non-mmap).
2
3use crate::codec_file::header::{decode_header, CorpusFileHeader};
4use crate::compressed_vector::from_bytes;
5use crate::errors::IoError;
6use std::io::{Read, Seek, SeekFrom};
7use tinyquant_core::codec::CompressedVector;
8
9/// Streaming reader for a Level-2 TQCV corpus file.
10///
11/// Reads records sequentially without memory-mapping the file.
12pub struct CodecFileReader<R: Read + Seek> {
13    inner: R,
14    header: CorpusFileHeader,
15    records_read: u64,
16}
17
18impl<R: Read + Seek> CodecFileReader<R> {
19    /// Open a TQCV corpus file from a seekable reader.
20    ///
21    /// Reads and validates the Level-2 header, then positions the reader
22    /// at the start of the body.
23    ///
24    /// # Errors
25    ///
26    /// Returns [`IoError`] if the header is malformed, the magic is wrong,
27    /// or the underlying I/O fails.
28    pub fn new(mut inner: R) -> Result<Self, IoError> {
29        let header = read_and_decode_header(&mut inner)?;
30        let body_offset = header.body_offset;
31        inner.seek(SeekFrom::Start(
32            u64::try_from(body_offset).map_err(|_| IoError::InvalidHeader)?,
33        ))?;
34        Ok(Self {
35            inner,
36            header,
37            records_read: 0,
38        })
39    }
40
41    /// Return a reference to the decoded Level-2 header.
42    pub const fn header(&self) -> &CorpusFileHeader {
43        &self.header
44    }
45
46    /// Read the next [`CompressedVector`] from the file.
47    ///
48    /// Returns `Ok(None)` when all `vector_count` records have been read.
49    ///
50    /// # Errors
51    ///
52    /// Returns [`IoError`] if the record is malformed or I/O fails.
53    pub fn next_vector(&mut self) -> Result<Option<CompressedVector>, IoError> {
54        if self.records_read >= self.header.vector_count {
55            return Ok(None);
56        }
57        let cv = read_record(&mut self.inner)?;
58        self.records_read += 1;
59        Ok(Some(cv))
60    }
61
62    /// Number of records read so far.
63    pub const fn records_read(&self) -> u64 {
64        self.records_read
65    }
66}
67
68/// Read and validate the Level-2 header from `r`.
69fn read_and_decode_header<R: Read + Seek>(r: &mut R) -> Result<CorpusFileHeader, IoError> {
70    // Read the fixed 24-byte header first to find config_hash_len.
71    let mut fixed = [0u8; 24];
72    r.read_exact(&mut fixed)?;
73
74    // Peek at config_hash_len (bytes 22..24)
75    let chl_bytes: [u8; 2] = fixed
76        .get(22..24)
77        .ok_or(IoError::Truncated {
78            needed: 24,
79            got: fixed.len(),
80        })?
81        .try_into()
82        .map_err(|_| IoError::InvalidHeader)?;
83    let config_hash_len = u16::from_le_bytes(chl_bytes) as usize;
84
85    // Read config_hash + 4-byte metadata_len
86    let mut var_prefix = vec![0u8; config_hash_len + 4];
87    r.read_exact(&mut var_prefix)?;
88
89    // Peek at metadata_len
90    let ml_bytes: [u8; 4] = var_prefix
91        .get(config_hash_len..config_hash_len + 4)
92        .ok_or(IoError::Truncated {
93            needed: config_hash_len + 4,
94            got: var_prefix.len(),
95        })?
96        .try_into()
97        .map_err(|_| IoError::InvalidHeader)?;
98    let metadata_len = u32::from_le_bytes(ml_bytes) as usize;
99
100    // Read metadata + alignment padding
101    let header_end = 24 + config_hash_len + 4 + metadata_len;
102    let body_offset = ((header_end + 7) / 8) * 8;
103    let remaining_header = body_offset - 24 - config_hash_len - 4;
104    let mut rest = vec![0u8; remaining_header];
105    r.read_exact(&mut rest)?;
106
107    // Reassemble for decode_header
108    let mut full = Vec::with_capacity(body_offset);
109    full.extend_from_slice(&fixed);
110    full.extend_from_slice(&var_prefix);
111    full.extend_from_slice(&rest);
112
113    decode_header(&full)
114}
115
116/// Read one length-prefixed Level-1 record from `r`.
117fn read_record<R: Read>(r: &mut R) -> Result<CompressedVector, IoError> {
118    let mut len_buf = [0u8; 4];
119    r.read_exact(&mut len_buf)?;
120    let record_len = u32::from_le_bytes(len_buf) as usize;
121
122    let mut payload = vec![0u8; record_len];
123    r.read_exact(&mut payload)?;
124
125    from_bytes(&payload)
126}