Skip to main content

inputx_dict_format/
codec.rs

1//! IDFv1 binary layout — header / entry / flag types + LE codec helpers.
2//!
3//! Wire layout per `.claude/PLAN-dict-format-IDFv1.md`. Little-endian
4//! throughout (x86_64 + arm64 are both LE; cross-platform safe).
5//! Alignment-strict: section boundaries are 8-byte aligned; `Entry` is
6//! exactly 16 bytes packed.
7
8use core::convert::TryInto;
9
/// Magic bytes at file offset 0. ASCII `"IDFv"`.
pub const MAGIC: [u8; 4] = *b"IDFv";

/// Size in bytes of the fixed header proper, i.e. every header field
/// except the trailing sha256. The sha256 occupies the next
/// `SHA256_SIZE` bytes, so sections begin at `FULL_HEADER_SIZE`
/// (`HEADER_SIZE + SHA256_SIZE`), not at `HEADER_SIZE`.
pub const HEADER_SIZE: usize = 64;

/// Fixed size in bytes of one encoded `EntryRecord`.
pub const ENTRY_SIZE: usize = 16;
18
/// On-disk format version, stored in the header's `format_version`
/// byte. `V1` is the layout described in
/// `.claude/PLAN-dict-format-IDFv1.md`. Future versions (v2+) are meant
/// to coexist via that same byte, so a v1 reader MUST reject any value
/// it does not recognise with a clear error.
#[repr(u8)]
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum Version {
    V1 = 1,
}

impl Version {
    /// Map the raw on-disk byte to a supported version.
    ///
    /// Yields `None` for every byte that is not a supported version;
    /// turning that into a readable error (e.g. "v2 file opened by v1
    /// reader") is the caller's job.
    pub fn from_byte(b: u8) -> Option<Self> {
        if b == Self::V1 as u8 {
            Some(Self::V1)
        } else {
            None
        }
    }
}
39
/// The engine a dictionary is built for. The `u8` values are stable
/// across format versions, so dispatch code can match on them without
/// any translation step.
#[repr(u8)]
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum EngineKind {
    Pinyin = 0,
    Wubi = 1,
    NihongoJukugo = 2,
    NihongoKanji = 3,
    Other = 4,
}

impl EngineKind {
    /// Decode the on-disk engine byte. Returns `None` for any value
    /// that has no variant assigned.
    pub fn from_byte(b: u8) -> Option<Self> {
        // Indexed by discriminant: slot `i` must hold the variant whose
        // `u8` value is `i`. Bytes past the end have no variant.
        const BY_DISCRIMINANT: [EngineKind; 5] = [
            EngineKind::Pinyin,
            EngineKind::Wubi,
            EngineKind::NihongoJukugo,
            EngineKind::NihongoKanji,
            EngineKind::Other,
        ];
        BY_DISCRIMINANT.get(usize::from(b)).copied()
    }
}
64
/// Flag byte stored with every entry. The bit assignments are stable
/// for the whole of IDFv1.
#[derive(Copy, Clone, Debug, PartialEq, Eq, Default)]
pub struct EntryFlags(pub u8);

impl EntryFlags {
    pub const BLACKLIST: u8 = 0b0000_0001;
    pub const CURATED_OVERRIDE: u8 = 0b0000_0010;
    pub const USER_ADDED: u8 = 0b0000_0100;
    /// Bits 5-7 hold an engine-specific 3-bit payload (0-7). For wubi
    /// entries (added v1.4.7 sub-phase A4 step 2) it is the
    /// `inputx_wubi::Layer` enum index, which lets cement-side fills
    /// recover (word, layer, raw_freq) tuples without re-reading the
    /// facade dict. Non-wubi engines leave it zero. The bits are opaque
    /// on their own: decoders should interpret them together with the
    /// IDF header's `engine_kind` byte.
    pub const ENGINE_TAG_MASK: u8 = 0b1110_0000;
    pub const ENGINE_TAG_SHIFT: u8 = 5;

    /// True when the `BLACKLIST` bit is set.
    pub fn is_blacklisted(self) -> bool {
        self.any_set(Self::BLACKLIST)
    }

    /// True when the `CURATED_OVERRIDE` bit is set.
    pub fn is_curated_override(self) -> bool {
        self.any_set(Self::CURATED_OVERRIDE)
    }

    /// True when the `USER_ADDED` bit is set.
    pub fn is_user_added(self) -> bool {
        self.any_set(Self::USER_ADDED)
    }

    /// Extract the 3-bit engine-specific tag from bits 5-7. For wubi
    /// this is the `Layer` enum's `as_index()`.
    pub fn engine_tag(self) -> u8 {
        let tag_bits = self.0 & Self::ENGINE_TAG_MASK;
        tag_bits >> Self::ENGINE_TAG_SHIFT
    }

    /// Return a copy with the engine tag replaced and every other bit
    /// left untouched.
    ///
    /// Only the low 3 bits of `tag` are used. A value above 7 is masked
    /// (not saturated), so its upper bits are silently dropped — keep
    /// the caller's enum strictly ≤ 7.
    pub fn with_engine_tag(self, tag: u8) -> Self {
        let payload_width_mask = Self::ENGINE_TAG_MASK >> Self::ENGINE_TAG_SHIFT;
        let payload = (tag & payload_width_mask) << Self::ENGINE_TAG_SHIFT;
        let untagged = self.0 & !Self::ENGINE_TAG_MASK;
        Self(untagged | payload)
    }

    /// Whether any bit of `bits` is set in this flag byte.
    fn any_set(self, bits: u8) -> bool {
        self.0 & bits != 0
    }
}
100
101/// Header, mirrors the on-disk 64-byte layout exactly. All integer
102/// fields are little-endian on disk; the public struct holds host-byte
103/// values (decoded on `parse`, encoded on `to_bytes`).
104#[derive(Copy, Clone, Debug, PartialEq, Eq)]
105pub struct Header {
106    pub magic: [u8; 4],
107    pub format_version: u8,
108    pub engine_kind: u8,
109    pub flags: u16,
110    pub entry_count: u32,
111    pub string_pool_offset: u32,
112    pub string_pool_size: u32,
113    pub entry_table_offset: u32,
114    pub fst_code_index_offset: u32,
115    pub fst_code_index_size: u32,
116    pub fst_word_index_offset: u32,
117    pub fst_word_index_size: u32,
118    pub bigram_offset: u32,
119    pub bigram_size: u32,
120    pub embedding_offset: u32,
121    pub embedding_dim: u16,
122    pub embedding_dtype: u8,
123    pub reserved: u8,
124    pub sha256_of_payload: [u8; 32],
125}
126
127impl Header {
128    /// Encode the header into a fixed 64-byte buffer (little-endian).
129    pub fn to_bytes(&self) -> [u8; HEADER_SIZE] {
130        let mut buf = [0u8; HEADER_SIZE];
131        buf[0..4].copy_from_slice(&self.magic);
132        buf[4] = self.format_version;
133        buf[5] = self.engine_kind;
134        buf[6..8].copy_from_slice(&self.flags.to_le_bytes());
135        buf[8..12].copy_from_slice(&self.entry_count.to_le_bytes());
136        buf[12..16].copy_from_slice(&self.string_pool_offset.to_le_bytes());
137        buf[16..20].copy_from_slice(&self.string_pool_size.to_le_bytes());
138        buf[20..24].copy_from_slice(&self.entry_table_offset.to_le_bytes());
139        buf[24..28].copy_from_slice(&self.fst_code_index_offset.to_le_bytes());
140        buf[28..32].copy_from_slice(&self.fst_code_index_size.to_le_bytes());
141        buf[32..36].copy_from_slice(&self.fst_word_index_offset.to_le_bytes());
142        buf[36..40].copy_from_slice(&self.fst_word_index_size.to_le_bytes());
143        buf[40..44].copy_from_slice(&self.bigram_offset.to_le_bytes());
144        buf[44..48].copy_from_slice(&self.bigram_size.to_le_bytes());
145        buf[48..52].copy_from_slice(&self.embedding_offset.to_le_bytes());
146        buf[52..54].copy_from_slice(&self.embedding_dim.to_le_bytes());
147        buf[54] = self.embedding_dtype;
148        buf[55] = self.reserved;
149        // sha256 trails — 8 bytes worth of layout used (54+1+1+8=64? let's
150        // recount: header is exactly 64. We've used 0..56 for non-sha
151        // fields. sha256 (32B) needs 32 — that's 88 total which OVERFLOWS.
152        // The spec keeps the header at 64 by NOT counting the sha256 in
153        // the header proper — it sits immediately after at offset 64. But
154        // PLAN-dict-format-IDFv1.md says `sha256_of_payload` IS in the
155        // header. Reconcile: bytes 56..88 (32 B) live in a 96-byte header
156        // region. We treat HEADER_SIZE as 64 + 32 = 96 conceptually but
157        // keep HEADER_SIZE = 64 = "everything BEFORE sha256". The full
158        // file region taken by header+sha256 is HEADER_SIZE + 32 = 96.
159        // sha256 is written separately by the writer after computing it
160        // over the payload. Return only the first 64.
161        buf
162    }
163
164    /// Decode a header from on-disk bytes. Returns `None` if `buf` is
165    /// too small or the magic does not match.
166    pub fn parse(buf: &[u8]) -> Option<Self> {
167        if buf.len() < HEADER_SIZE + 32 { return None; }
168        if buf[0..4] != MAGIC { return None; }
169        let mut sha = [0u8; 32];
170        sha.copy_from_slice(&buf[HEADER_SIZE..HEADER_SIZE + 32]);
171        Some(Self {
172            magic: MAGIC,
173            format_version: buf[4],
174            engine_kind: buf[5],
175            flags: u16::from_le_bytes(buf[6..8].try_into().ok()?),
176            entry_count: u32::from_le_bytes(buf[8..12].try_into().ok()?),
177            string_pool_offset: u32::from_le_bytes(buf[12..16].try_into().ok()?),
178            string_pool_size: u32::from_le_bytes(buf[16..20].try_into().ok()?),
179            entry_table_offset: u32::from_le_bytes(buf[20..24].try_into().ok()?),
180            fst_code_index_offset: u32::from_le_bytes(buf[24..28].try_into().ok()?),
181            fst_code_index_size: u32::from_le_bytes(buf[28..32].try_into().ok()?),
182            fst_word_index_offset: u32::from_le_bytes(buf[32..36].try_into().ok()?),
183            fst_word_index_size: u32::from_le_bytes(buf[36..40].try_into().ok()?),
184            bigram_offset: u32::from_le_bytes(buf[40..44].try_into().ok()?),
185            bigram_size: u32::from_le_bytes(buf[44..48].try_into().ok()?),
186            embedding_offset: u32::from_le_bytes(buf[48..52].try_into().ok()?),
187            embedding_dim: u16::from_le_bytes(buf[52..54].try_into().ok()?),
188            embedding_dtype: buf[54],
189            reserved: buf[55],
190            sha256_of_payload: sha,
191        })
192    }
193}
194
195/// Reserved byte length for the sha256 region immediately after the
196/// 64-byte header proper. The full on-disk header region is
197/// `HEADER_SIZE + SHA256_SIZE = 96` bytes; sections begin at offset 96.
198pub const SHA256_SIZE: usize = 32;
199
200/// Total on-disk header region (header + sha256 area). Sections begin here.
201pub const FULL_HEADER_SIZE: usize = HEADER_SIZE + SHA256_SIZE;
202
203/// Per-entry record (16 bytes packed). `word_offset` and `code_offset`
204/// are u24 (3 bytes); they point into the string pool. `log_prior` is
205/// signed Q4 fixed-point (one log unit per 16 integer steps, per
206/// [`inputx_scoring::Q4`]). `raw_freq` is the original pre-quantization
207/// corpus frequency (added v1.4.7 sub-phase A4 step 1) — it lets
208/// cement-side cement rebuild a lossless tiebreaker when two entries
209/// land in the same Q4 `log_prior` bucket (e.g. 乎/护 for code `hu`,
210/// both quantize to Q4=170; raw_freq distinguishes them).
211///
212/// Layout: `u24 word_offset + u24 code_offset + i16 log_prior + u8
213/// match_type + u8 flags + u32 raw_freq + 2 bytes reserved = 16`.
214///
215/// `raw_freq=0` on disk is the v1.4.6-era backward-compatible default
216/// (those bytes were `bigram_offset`, never written non-zero and never
217/// read), so old .idf blobs decode as `raw_freq=0` — the only fallout
218/// is loss of the tiebreaker for legacy snapshots.
219#[derive(Copy, Clone, Debug, PartialEq, Eq)]
220pub struct EntryRecord {
221    pub word_offset: u32, // u24 on disk
222    pub code_offset: u32, // u24 on disk
223    pub log_prior: i16,
224    pub match_type: u8,
225    pub flags: u8,
226    pub raw_freq: u32,
227    pub embedding_offset: u32,
228}
229
230impl EntryRecord {
231    /// Encode to the 16-byte on-disk representation.
232    pub fn to_bytes(&self) -> [u8; ENTRY_SIZE] {
233        let mut buf = [0u8; ENTRY_SIZE];
234        // u24 little-endian = low 3 bytes
235        let wo = self.word_offset.to_le_bytes();
236        buf[0..3].copy_from_slice(&wo[0..3]);
237        let co = self.code_offset.to_le_bytes();
238        buf[3..6].copy_from_slice(&co[0..3]);
239        buf[6..8].copy_from_slice(&self.log_prior.to_le_bytes());
240        buf[8] = self.match_type;
241        buf[9] = self.flags;
242        buf[10..14].copy_from_slice(&self.raw_freq.to_le_bytes());
243        // Bytes 14..16 reserved. Per-entry embedding_offset lives in a
244        // header-referenced side table (v2 extension); v1 leaves these
245        // zero. No write needed — buf already zero.
246        buf
247    }
248
249    /// Decode from 16-byte on-disk representation.
250    pub fn parse(buf: &[u8; ENTRY_SIZE]) -> Self {
251        let mut wo = [0u8; 4];
252        wo[0..3].copy_from_slice(&buf[0..3]);
253        let word_offset = u32::from_le_bytes(wo);
254        let mut co = [0u8; 4];
255        co[0..3].copy_from_slice(&buf[3..6]);
256        let code_offset = u32::from_le_bytes(co);
257        let log_prior = i16::from_le_bytes([buf[6], buf[7]]);
258        let match_type = buf[8];
259        let flags = buf[9];
260        let raw_freq = u32::from_le_bytes([buf[10], buf[11], buf[12], buf[13]]);
261        EntryRecord {
262            word_offset,
263            code_offset,
264            log_prior,
265            match_type,
266            flags,
267            raw_freq,
268            embedding_offset: 0, // v1: side table, not per-entry
269        }
270    }
271}
272
273/// Encode an [`inputx_scoring::MatchType`] into the single u8 stored in
274/// `EntryRecord::match_type`. Round-trippable via [`decode_match_type`].
275/// Inline payload (proximity / fuzzy cost / bigram_links) is lost on
276/// encode — the writer uses `Exact` for entries that have a fixed
277/// dict-baseline classification; runtime paths attach the inline payload
278/// based on how the buffer matched.
279pub fn encode_match_type(mt: inputx_scoring::MatchType) -> u8 {
280    match mt {
281        inputx_scoring::MatchType::Exact => 0,
282        inputx_scoring::MatchType::Prefix(_) => 1,
283        inputx_scoring::MatchType::Fuzzy(_) => 2,
284        inputx_scoring::MatchType::Composed { .. } => 3,
285    }
286}
287
288/// Decode `EntryRecord::match_type` back to [`inputx_scoring::MatchType`].
289/// Inline payload fields are zeroed; callers attach runtime values.
290pub fn decode_match_type(b: u8) -> inputx_scoring::MatchType {
291    match b {
292        0 => inputx_scoring::MatchType::Exact,
293        1 => inputx_scoring::MatchType::Prefix(0),
294        2 => inputx_scoring::MatchType::Fuzzy(0),
295        3 => inputx_scoring::MatchType::Composed { bigram_links: 0 },
296        _ => inputx_scoring::MatchType::Exact, // forward-compat: unknown → Exact
297    }
298}
299
#[cfg(test)]
mod tests {
    use super::*;

    /// The size constants are part of the wire format; pin them.
    #[test]
    fn header_size_constants_match_spec() {
        assert_eq!(HEADER_SIZE, 64);
        assert_eq!(SHA256_SIZE, 32);
        assert_eq!(FULL_HEADER_SIZE, 96);
        assert_eq!(ENTRY_SIZE, 16);
    }

    #[test]
    fn header_round_trip_preserves_all_fields() {
        let h = Header {
            magic: MAGIC,
            format_version: 1,
            engine_kind: 2,
            flags: 0x0007,
            entry_count: 237_842,
            string_pool_offset: 96,
            string_pool_size: 2_097_152,
            entry_table_offset: 2_097_248,
            fst_code_index_offset: 5_900_000,
            fst_code_index_size: 1_048_576,
            fst_word_index_offset: 6_948_576,
            fst_word_index_size: 524_288,
            bigram_offset: 0,
            bigram_size: 0,
            embedding_offset: 0,
            embedding_dim: 0,
            embedding_dtype: 0,
            reserved: 0,
            sha256_of_payload: [0xab; 32],
        };
        let bytes = h.to_bytes();
        // `to_bytes` emits only the 64-byte header proper, while
        // `Header::parse` needs the full 96-byte region, so append the
        // sha256 trailer at bytes 64..96 by hand.
        let mut full = [0u8; FULL_HEADER_SIZE];
        full[..HEADER_SIZE].copy_from_slice(&bytes);
        full[HEADER_SIZE..].copy_from_slice(&h.sha256_of_payload);
        let h2 = Header::parse(&full).expect("parse");
        assert_eq!(h2, h);
    }

    #[test]
    fn header_rejects_wrong_magic() {
        let mut buf = [0u8; FULL_HEADER_SIZE];
        buf[0..4].copy_from_slice(b"WHAT");
        assert!(Header::parse(&buf).is_none());
    }

    #[test]
    fn header_rejects_short_buffer() {
        let buf = [0u8; HEADER_SIZE]; // missing sha256 trailer
        assert!(Header::parse(&buf).is_none());
    }

    #[test]
    fn version_byte_round_trip() {
        assert_eq!(Version::from_byte(1), Some(Version::V1));
        assert_eq!(Version::from_byte(2), None);
        assert_eq!(Version::from_byte(0), None);
    }

    #[test]
    fn engine_kind_byte_round_trip() {
        for k in [
            EngineKind::Pinyin,
            EngineKind::Wubi,
            EngineKind::NihongoJukugo,
            EngineKind::NihongoKanji,
            EngineKind::Other,
        ] {
            assert_eq!(EngineKind::from_byte(k as u8), Some(k));
        }
        assert_eq!(EngineKind::from_byte(99), None);
    }

    #[test]
    fn entry_flag_predicates_read_their_own_bit() {
        let none = EntryFlags::default();
        assert!(!none.is_blacklisted());
        assert!(!none.is_curated_override());
        assert!(!none.is_user_added());

        assert!(EntryFlags(EntryFlags::BLACKLIST).is_blacklisted());
        assert!(EntryFlags(EntryFlags::CURATED_OVERRIDE).is_curated_override());
        assert!(EntryFlags(EntryFlags::USER_ADDED).is_user_added());

        // The engine-tag bits must not leak into the boolean flags.
        let only_tag = EntryFlags(EntryFlags::ENGINE_TAG_MASK);
        assert!(!only_tag.is_blacklisted());
        assert!(!only_tag.is_curated_override());
        assert!(!only_tag.is_user_added());
    }

    #[test]
    fn entry_flags_engine_tag_round_trip() {
        let base = EntryFlags(EntryFlags::BLACKLIST | EntryFlags::USER_ADDED);
        for tag in 0u8..8 {
            let tagged = base.with_engine_tag(tag);
            assert_eq!(tagged.engine_tag(), tag);
            // Setting the tag leaves the other bits alone.
            assert!(tagged.is_blacklisted());
            assert!(tagged.is_user_added());
            assert!(!tagged.is_curated_override());
        }
        // Out-of-range tags keep only their low 3 bits.
        assert_eq!(base.with_engine_tag(0b1010).engine_tag(), 0b010);
        // Re-tagging replaces the previous tag instead of OR-ing into it.
        assert_eq!(base.with_engine_tag(7).with_engine_tag(1).engine_tag(), 1);
    }

    #[test]
    fn entry_round_trip_preserves_fields() {
        let e = EntryRecord {
            word_offset: 0x12_3456,
            code_offset: 0xab_cdef,
            log_prior: -42,
            match_type: 1,
            flags: EntryFlags::BLACKLIST | EntryFlags::USER_ADDED,
            raw_freq: 0xdead_beef,
            embedding_offset: 0,
        };
        let bytes = e.to_bytes();
        assert_eq!(bytes.len(), ENTRY_SIZE);
        let e2 = EntryRecord::parse(&bytes);
        assert_eq!(e2, e);
    }

    #[test]
    fn entry_u24_offsets_truncate_at_24_bits() {
        // u24 max = 0xFFFFFF (16,777,215). String pool may exceed this
        // in theory (e.g., very large embeddings ship). The writer must
        // detect overflow; the codec itself only keeps the low 24 bits.
        let e = EntryRecord {
            word_offset: 0xFF_FFFF,
            code_offset: 0,
            log_prior: 0,
            match_type: 0,
            flags: 0,
            raw_freq: 0,
            embedding_offset: 0,
        };
        let bytes = e.to_bytes();
        let e2 = EntryRecord::parse(&bytes);
        assert_eq!(e2.word_offset, 0xFF_FFFF);

        // Offsets wider than 24 bits lose their top byte on encode.
        let wide = EntryRecord {
            word_offset: 0x0100_0001,
            code_offset: 0xFF00_0002,
            ..e
        };
        let narrowed = EntryRecord::parse(&wide.to_bytes());
        assert_eq!(narrowed.word_offset, 0x00_0001);
        assert_eq!(narrowed.code_offset, 0x00_0002);
    }

    #[test]
    fn match_type_round_trip() {
        for mt in [
            inputx_scoring::MatchType::Exact,
            inputx_scoring::MatchType::Prefix(800),
            inputx_scoring::MatchType::Fuzzy(300),
            inputx_scoring::MatchType::Composed { bigram_links: 2 },
        ] {
            let b = encode_match_type(mt);
            let back = decode_match_type(b);
            // Inline payload not preserved — only variant tag round-trips.
            assert_eq!(
                core::mem::discriminant(&back),
                core::mem::discriminant(&mt),
                "variant {mt:?} → byte {b} → {back:?} (variant must match)"
            );
        }
        // Unknown byte falls back to Exact (forward-compat).
        assert_eq!(decode_match_type(99), inputx_scoring::MatchType::Exact);
    }
}