tinyquant_io/codec_file/writer.rs
1//! Level-2 TQCV corpus file writer.
2//!
3//! Protocol:
4//!
5//! 1. [`CodecFileWriter::create`] — open file, write TQCX header (tentative).
6//! 2. [`CodecFileWriter::append`] — write one record (4-byte length prefix + Level-1 payload).
7//! 3. [`CodecFileWriter::finalize`] — back-patch `vector_count` and flip magic to TQCV.
8
9use crate::codec_file::header::{encode_header, FIXED_HEADER_SIZE, MAGIC_FINAL};
10use crate::compressed_vector::to_bytes;
11use crate::errors::IoError;
12use std::fs::{File, OpenOptions};
13use std::io::{Seek, SeekFrom, Write};
14use std::path::Path;
15use tinyquant_core::codec::CompressedVector;
16
/// Streaming writer for a Level-2 TQCV corpus file.
///
/// The file starts out with the tentative TQCX magic and is only switched to
/// the final TQCV magic by [`CodecFileWriter::finalize`], once every record
/// has been written and the record count is known.
#[derive(Debug)]
pub struct CodecFileWriter {
    /// Open handle to the corpus file being written.
    file: File,
    /// Number of records appended so far; back-patched into the header on finalize.
    vector_count: u64,
    /// Config hash supplied at creation time.
    config_hash: String,
    /// Dimension supplied at creation time.
    dimension: u32,
    /// Bit width supplied at creation time.
    bit_width: u8,
    /// Residual flag supplied at creation time.
    residual: bool,
}
29
30impl CodecFileWriter {
31 /// Create a new TQCV corpus file at `path` and write the tentative header.
32 ///
33 /// # Errors
34 ///
35 /// Returns [`IoError`] if the file cannot be created, or if any header
36 /// field value is invalid.
37 pub fn create(
38 path: &Path,
39 config_hash: &str,
40 dimension: u32,
41 bit_width: u8,
42 residual: bool,
43 metadata: &[u8],
44 ) -> Result<Self, IoError> {
45 let header = encode_header(config_hash, dimension, bit_width, residual, metadata, 0)?;
46 let mut file = OpenOptions::new()
47 .write(true)
48 .create(true)
49 .truncate(true)
50 .open(path)?;
51 file.write_all(&header)?;
52 // Sync after writing the tentative header.
53 file.sync_data()?;
54 Ok(Self {
55 file,
56 vector_count: 0,
57 config_hash: config_hash.to_owned(),
58 dimension,
59 bit_width,
60 residual,
61 })
62 }
63
64 /// Append one [`CompressedVector`] as a length-prefixed Level-1 record.
65 ///
66 /// # Errors
67 ///
68 /// Returns [`IoError`] if the vector cannot be serialized or written.
69 pub fn append(&mut self, cv: &CompressedVector) -> Result<(), IoError> {
70 let payload = to_bytes(cv);
71 #[allow(clippy::cast_possible_truncation)]
72 let record_len = payload.len() as u32;
73 self.file.write_all(&record_len.to_le_bytes())?;
74 self.file.write_all(&payload)?;
75 self.vector_count += 1;
76 Ok(())
77 }
78
79 /// Finalize the file: sync body, back-patch `vector_count`, flip magic to TQCV, sync again.
80 ///
81 /// After calling `finalize`, the writer is consumed and the file is closed.
82 ///
83 /// # Errors
84 ///
85 /// Returns [`IoError`] if any seek, write, or sync operation fails.
86 pub fn finalize(mut self) -> Result<(), IoError> {
87 // 1. Sync the body data so all appended records are durable.
88 self.file.sync_data()?;
89
90 // 2. Back-patch vector_count at offset 8 (u64 LE).
91 self.file.seek(SeekFrom::Start(8))?;
92 self.file.write_all(&self.vector_count.to_le_bytes())?;
93
94 // 3. Sync vector_count before writing the magic flip. This ordering
95 // guarantee is critical: a crash between steps 2 and 4 must leave
96 // the file in a state the reader will reject (TQCX magic), never
97 // in a state with TQCV but an un-synced count.
98 self.file.sync_data()?;
99
100 // 4. Flip magic from TQCX to TQCV at offset 0 — the linearization
101 // point. After this write the file is visible to readers.
102 self.file.seek(SeekFrom::Start(0))?;
103 self.file.write_all(MAGIC_FINAL)?;
104
105 // 5. Final sync — file is now readable and fully durable.
106 self.file.sync_data()?;
107 Ok(())
108 }
109
110 /// Number of vectors written so far.
111 pub const fn vector_count(&self) -> u64 {
112 self.vector_count
113 }
114
115 /// Config hash used when creating the file.
116 pub fn config_hash(&self) -> &str {
117 &self.config_hash
118 }
119
120 /// Dimension declared in the header.
121 pub const fn dimension(&self) -> u32 {
122 self.dimension
123 }
124
125 /// Bit width declared in the header.
126 pub const fn bit_width(&self) -> u8 {
127 self.bit_width
128 }
129
130 /// Whether the file was created with the residual flag set.
131 pub const fn residual(&self) -> bool {
132 self.residual
133 }
134
135 /// Compute the byte offset of the first record body in the file.
136 ///
137 /// This mirrors the `body_offset` formula from [`encode_header`]: the
138 /// header is padded to an 8-byte boundary after the variable prefix.
139 ///
140 /// # Errors
141 ///
142 /// Returns [`IoError::InvalidHeader`] if the parameters would produce
143 /// an invalid header (same constraints as [`encode_header`]).
144 // Cannot be const because Err()/Ok() in const fns require Rust >= 1.83.
145 #[allow(clippy::missing_const_for_fn)]
146 pub fn body_offset(config_hash: &str, metadata_len: usize) -> Result<usize, IoError> {
147 let hash_len = config_hash.as_bytes().len();
148 if hash_len > 256 {
149 return Err(IoError::InvalidHeader);
150 }
151 let header_end = FIXED_HEADER_SIZE + hash_len + 4 + metadata_len;
152 Ok(((header_end + 7) / 8) * 8)
153 }
154}