tinyquant_io/codec_file/
reader.rs1use crate::codec_file::header::{decode_header, CorpusFileHeader};
4use crate::compressed_vector::from_bytes;
5use crate::errors::IoError;
6use std::io::{Read, Seek, SeekFrom};
7use tinyquant_core::codec::CompressedVector;
8
9pub struct CodecFileReader<R: Read + Seek> {
13 inner: R,
14 header: CorpusFileHeader,
15 records_read: u64,
16}
17
18impl<R: Read + Seek> CodecFileReader<R> {
19 pub fn new(mut inner: R) -> Result<Self, IoError> {
29 let header = read_and_decode_header(&mut inner)?;
30 let body_offset = header.body_offset;
31 inner.seek(SeekFrom::Start(
32 u64::try_from(body_offset).map_err(|_| IoError::InvalidHeader)?,
33 ))?;
34 Ok(Self {
35 inner,
36 header,
37 records_read: 0,
38 })
39 }
40
41 pub const fn header(&self) -> &CorpusFileHeader {
43 &self.header
44 }
45
46 pub fn next_vector(&mut self) -> Result<Option<CompressedVector>, IoError> {
54 if self.records_read >= self.header.vector_count {
55 return Ok(None);
56 }
57 let cv = read_record(&mut self.inner)?;
58 self.records_read += 1;
59 Ok(Some(cv))
60 }
61
62 pub const fn records_read(&self) -> u64 {
64 self.records_read
65 }
66}
67
68fn read_and_decode_header<R: Read + Seek>(r: &mut R) -> Result<CorpusFileHeader, IoError> {
70 let mut fixed = [0u8; 24];
72 r.read_exact(&mut fixed)?;
73
74 let chl_bytes: [u8; 2] = fixed
76 .get(22..24)
77 .ok_or(IoError::Truncated {
78 needed: 24,
79 got: fixed.len(),
80 })?
81 .try_into()
82 .map_err(|_| IoError::InvalidHeader)?;
83 let config_hash_len = u16::from_le_bytes(chl_bytes) as usize;
84
85 let mut var_prefix = vec![0u8; config_hash_len + 4];
87 r.read_exact(&mut var_prefix)?;
88
89 let ml_bytes: [u8; 4] = var_prefix
91 .get(config_hash_len..config_hash_len + 4)
92 .ok_or(IoError::Truncated {
93 needed: config_hash_len + 4,
94 got: var_prefix.len(),
95 })?
96 .try_into()
97 .map_err(|_| IoError::InvalidHeader)?;
98 let metadata_len = u32::from_le_bytes(ml_bytes) as usize;
99
100 let header_end = 24 + config_hash_len + 4 + metadata_len;
102 let body_offset = ((header_end + 7) / 8) * 8;
103 let remaining_header = body_offset - 24 - config_hash_len - 4;
104 let mut rest = vec![0u8; remaining_header];
105 r.read_exact(&mut rest)?;
106
107 let mut full = Vec::with_capacity(body_offset);
109 full.extend_from_slice(&fixed);
110 full.extend_from_slice(&var_prefix);
111 full.extend_from_slice(&rest);
112
113 decode_header(&full)
114}
115
116fn read_record<R: Read>(r: &mut R) -> Result<CompressedVector, IoError> {
118 let mut len_buf = [0u8; 4];
119 r.read_exact(&mut len_buf)?;
120 let record_len = u32::from_le_bytes(len_buf) as usize;
121
122 let mut payload = vec![0u8; record_len];
123 r.read_exact(&mut payload)?;
124
125 from_bytes(&payload)
126}