Skip to main content

tinyquant_io/codec_file/
writer.rs

1//! Level-2 TQCV corpus file writer.
2//!
3//! Protocol:
4//!
5//! 1. [`CodecFileWriter::create`] — open file, write TQCX header (tentative).
6//! 2. [`CodecFileWriter::append`] — write one record (4-byte length prefix + Level-1 payload).
7//! 3. [`CodecFileWriter::finalize`] — back-patch `vector_count` and flip magic to TQCV.
8
9use crate::codec_file::header::{encode_header, FIXED_HEADER_SIZE, MAGIC_FINAL};
10use crate::compressed_vector::to_bytes;
11use crate::errors::IoError;
12use std::fs::{File, OpenOptions};
13use std::io::{Seek, SeekFrom, Write};
14use std::path::Path;
15use tinyquant_core::codec::CompressedVector;
16
/// Streaming writer for a Level-2 TQCV corpus file.
///
/// The file is opened with TQCX (tentative) magic and finalized to TQCV
/// once all records have been written and the count is known.
///
/// The header parameters supplied at creation time are retained so callers
/// can query them while records are being appended.
#[derive(Debug)]
pub struct CodecFileWriter {
    /// Open handle to the corpus file being written.
    file: File,
    /// Number of records appended so far; back-patched into the header on finalize.
    vector_count: u64,
    /// Config hash that was encoded into the header at creation time.
    config_hash: String,
    /// Vector dimension declared in the header.
    dimension: u32,
    /// Bit width declared in the header.
    bit_width: u8,
    /// Residual flag declared in the header.
    residual: bool,
}
29
30impl CodecFileWriter {
31    /// Create a new TQCV corpus file at `path` and write the tentative header.
32    ///
33    /// # Errors
34    ///
35    /// Returns [`IoError`] if the file cannot be created, or if any header
36    /// field value is invalid.
37    pub fn create(
38        path: &Path,
39        config_hash: &str,
40        dimension: u32,
41        bit_width: u8,
42        residual: bool,
43        metadata: &[u8],
44    ) -> Result<Self, IoError> {
45        let header = encode_header(config_hash, dimension, bit_width, residual, metadata, 0)?;
46        let mut file = OpenOptions::new()
47            .write(true)
48            .create(true)
49            .truncate(true)
50            .open(path)?;
51        file.write_all(&header)?;
52        // Sync after writing the tentative header.
53        file.sync_data()?;
54        Ok(Self {
55            file,
56            vector_count: 0,
57            config_hash: config_hash.to_owned(),
58            dimension,
59            bit_width,
60            residual,
61        })
62    }
63
64    /// Append one [`CompressedVector`] as a length-prefixed Level-1 record.
65    ///
66    /// # Errors
67    ///
68    /// Returns [`IoError`] if the vector cannot be serialized or written.
69    pub fn append(&mut self, cv: &CompressedVector) -> Result<(), IoError> {
70        let payload = to_bytes(cv);
71        #[allow(clippy::cast_possible_truncation)]
72        let record_len = payload.len() as u32;
73        self.file.write_all(&record_len.to_le_bytes())?;
74        self.file.write_all(&payload)?;
75        self.vector_count += 1;
76        Ok(())
77    }
78
79    /// Finalize the file: sync body, back-patch `vector_count`, flip magic to TQCV, sync again.
80    ///
81    /// After calling `finalize`, the writer is consumed and the file is closed.
82    ///
83    /// # Errors
84    ///
85    /// Returns [`IoError`] if any seek, write, or sync operation fails.
86    pub fn finalize(mut self) -> Result<(), IoError> {
87        // 1. Sync the body data so all appended records are durable.
88        self.file.sync_data()?;
89
90        // 2. Back-patch vector_count at offset 8 (u64 LE).
91        self.file.seek(SeekFrom::Start(8))?;
92        self.file.write_all(&self.vector_count.to_le_bytes())?;
93
94        // 3. Sync vector_count before writing the magic flip. This ordering
95        //    guarantee is critical: a crash between steps 2 and 4 must leave
96        //    the file in a state the reader will reject (TQCX magic), never
97        //    in a state with TQCV but an un-synced count.
98        self.file.sync_data()?;
99
100        // 4. Flip magic from TQCX to TQCV at offset 0 — the linearization
101        //    point. After this write the file is visible to readers.
102        self.file.seek(SeekFrom::Start(0))?;
103        self.file.write_all(MAGIC_FINAL)?;
104
105        // 5. Final sync — file is now readable and fully durable.
106        self.file.sync_data()?;
107        Ok(())
108    }
109
110    /// Number of vectors written so far.
111    pub const fn vector_count(&self) -> u64 {
112        self.vector_count
113    }
114
115    /// Config hash used when creating the file.
116    pub fn config_hash(&self) -> &str {
117        &self.config_hash
118    }
119
120    /// Dimension declared in the header.
121    pub const fn dimension(&self) -> u32 {
122        self.dimension
123    }
124
125    /// Bit width declared in the header.
126    pub const fn bit_width(&self) -> u8 {
127        self.bit_width
128    }
129
130    /// Whether the file was created with the residual flag set.
131    pub const fn residual(&self) -> bool {
132        self.residual
133    }
134
135    /// Compute the byte offset of the first record body in the file.
136    ///
137    /// This mirrors the `body_offset` formula from [`encode_header`]: the
138    /// header is padded to an 8-byte boundary after the variable prefix.
139    ///
140    /// # Errors
141    ///
142    /// Returns [`IoError::InvalidHeader`] if the parameters would produce
143    /// an invalid header (same constraints as [`encode_header`]).
144    // Cannot be const because Err()/Ok() in const fns require Rust >= 1.83.
145    #[allow(clippy::missing_const_for_fn)]
146    pub fn body_offset(config_hash: &str, metadata_len: usize) -> Result<usize, IoError> {
147        let hash_len = config_hash.as_bytes().len();
148        if hash_len > 256 {
149            return Err(IoError::InvalidHeader);
150        }
151        let header_end = FIXED_HEADER_SIZE + hash_len + 4 + metadata_len;
152        Ok(((header_end + 7) / 8) * 8)
153    }
154}