inputx-dict-format 1.4.0

IDFv1 binary dict format for IME engines — mmap zero-copy reader + deterministic writer. Probability-native (log_prior in Q4 fixed-point), versioned, sha256-verified. Shared format across pinyin / wubi / Japanese / Korean / Vietnamese / any code-to-word IME.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
//! `IdfReader` — mmap zero-copy reader for IDFv1 files.
//!
//! Cold load: O(0) parse — open the file, validate the 96-byte header
//! region, set up offset views into the mmap region. Lookup: direct
//! pointer arithmetic, no allocation.

use alloc::vec::Vec;
use core::result::Result;

use crate::codec::{
    decode_match_type, EngineKind, EntryFlags, EntryRecord, Header, Version,
    FULL_HEADER_SIZE, MAGIC,
};

/// Lookup error kind. Public so callers can distinguish "file is a
/// different IDFv version" from "file is corrupt" — actionable
/// difference: a v2 file should drive a tool upgrade prompt; a corrupt
/// file should drive a rebuild.
#[derive(Debug)]
pub enum OpenError {
    /// I/O failure while reading the file. Wraps the source.
    #[cfg(feature = "std")]
    Io(std::io::Error),
    /// File is too short to hold even the header region.
    TooShort,
    /// First 4 bytes are not `b"IDFv"`. File is not an IDF dict at all.
    BadMagic,
    /// `format_version` byte is not `1`. A future v2 reader will accept
    /// these; v1 reader returns this so the caller can prompt a rebuild
    /// against the older tool.
    UnsupportedVersion(u8),
    /// `engine_kind` byte is outside the known set (per
    /// [`EngineKind::from_byte`]).
    UnknownEngineKind(u8),
    /// Header section offsets disagree with the file size — points past
    /// EOF or overlap. Indicates a truncated or rewritten file.
    CorruptOffsets,
    /// sha256_of_payload in the header does not match a fresh hash of
    /// the post-header bytes. Indicates the file was tampered with or
    /// corrupted between build and load.
    Sha256Mismatch,
}

#[cfg(feature = "std")]
impl From<std::io::Error> for OpenError {
    fn from(e: std::io::Error) -> Self { Self::Io(e) }
}

/// A read-only entry view backed by mmap (when `IdfReader` was opened
/// from a file) or by an owned byte buffer (in `no_std` use). `word`
/// and `code` borrow string-pool bytes; the lifetime ties them to the
/// underlying reader.
#[derive(Copy, Clone, Debug)]
pub struct Entry<'a> {
    pub word: &'a str,
    pub code: &'a str,
    pub log_prior: i16,
    /// Pre-quantization corpus frequency. Added v1.4.7 sub-phase A4
    /// as a lossless tiebreaker for entries that share the same Q4
    /// `log_prior` bucket — without it, cement-side cement can only
    /// recover an estimated freq via `exp(log_prior/Q4) - 1`, which
    /// collides exactly for bucket-mates. `0` on legacy / synthetic
    /// .idf blobs (the byte slot was previously an unused
    /// `bigram_offset` placeholder, always zero).
    pub raw_freq: u32,
    pub match_type: inputx_scoring::MatchType,
    pub flags: EntryFlags,
}

/// IDFv1 reader. Holds the file's byte region and the parsed header.
/// `entry_index` lookups are constant-time arithmetic; FST code/word
/// scans use `inputx_fsa` directly over the index sections.
pub struct IdfReader<B>
where
    B: AsRef<[u8]>,
{
    bytes: B,
    header: Header,
}

impl<B: AsRef<[u8]>> IdfReader<B> {
    /// Build a reader from an in-memory byte region. Validates the
    /// header and (in std builds) the sha256 of payload.
    pub fn from_bytes(bytes: B) -> Result<Self, OpenError> {
        let buf = bytes.as_ref();
        if buf.len() < FULL_HEADER_SIZE {
            return Err(OpenError::TooShort);
        }
        if buf[0..4] != MAGIC {
            return Err(OpenError::BadMagic);
        }
        let header = Header::parse(buf).ok_or(OpenError::BadMagic)?;
        if Version::from_byte(header.format_version).is_none() {
            return Err(OpenError::UnsupportedVersion(header.format_version));
        }
        if EngineKind::from_byte(header.engine_kind).is_none() {
            return Err(OpenError::UnknownEngineKind(header.engine_kind));
        }
        // Bounds-check the section offsets against the file size — any
        // off-by-N or attacker-tampered file slips out here.
        let file_len = buf.len() as u32;
        for (off, sz) in [
            (header.string_pool_offset, header.string_pool_size),
            (
                header.entry_table_offset,
                header.entry_count.saturating_mul(crate::codec::ENTRY_SIZE as u32),
            ),
            (header.fst_code_index_offset, header.fst_code_index_size),
            (header.fst_word_index_offset, header.fst_word_index_size),
        ] {
            if off > file_len || off.saturating_add(sz) > file_len {
                return Err(OpenError::CorruptOffsets);
            }
        }
        // Verify sha256 of payload (covers everything after the sha256
        // field; offset FULL_HEADER_SIZE .. file_end).
        #[cfg(feature = "std")]
        {
            use sha2::{Digest, Sha256};
            let mut hasher = Sha256::new();
            hasher.update(&buf[FULL_HEADER_SIZE..]);
            let got: [u8; 32] = hasher.finalize().into();
            if got != header.sha256_of_payload {
                return Err(OpenError::Sha256Mismatch);
            }
        }
        Ok(Self { bytes, header })
    }

    pub fn header(&self) -> &Header { &self.header }
    pub fn version(&self) -> Version {
        Version::from_byte(self.header.format_version).expect("validated at open")
    }
    pub fn engine_kind(&self) -> EngineKind {
        EngineKind::from_byte(self.header.engine_kind).expect("validated at open")
    }
    pub fn entry_count(&self) -> u32 { self.header.entry_count }
    pub fn sha256(&self) -> [u8; 32] { self.header.sha256_of_payload }

    /// All entries in entry-table order. O(n) — used by `prefix_top_k`
    /// fallback and by tests; production hot paths should go through
    /// the FST code / word indexes.
    pub fn entries(&self) -> impl Iterator<Item = Entry<'_>> + '_ {
        let buf = self.bytes.as_ref();
        let entry_table_start = self.header.entry_table_offset as usize;
        let n = self.header.entry_count as usize;
        let string_pool_start = self.header.string_pool_offset as usize;
        let string_pool_end =
            string_pool_start + self.header.string_pool_size as usize;
        let pool = &buf[string_pool_start..string_pool_end];
        (0..n).map(move |i| {
            let off = entry_table_start + i * crate::codec::ENTRY_SIZE;
            let rec_bytes: [u8; crate::codec::ENTRY_SIZE] = buf
                [off..off + crate::codec::ENTRY_SIZE]
                .try_into()
                .expect("entry slice is exactly ENTRY_SIZE");
            let rec = EntryRecord::parse(&rec_bytes);
            decode_entry(&rec, pool)
        })
    }

    /// Entry at the given index. `index` must be `< entry_count`.
    pub fn entry_at(&self, index: u32) -> Option<Entry<'_>> {
        if index >= self.header.entry_count {
            return None;
        }
        let buf = self.bytes.as_ref();
        let off = self.header.entry_table_offset as usize
            + index as usize * crate::codec::ENTRY_SIZE;
        let rec_bytes: [u8; crate::codec::ENTRY_SIZE] = buf
            [off..off + crate::codec::ENTRY_SIZE]
            .try_into()
            .ok()?;
        let rec = EntryRecord::parse(&rec_bytes);
        let pool = self.string_pool();
        Some(decode_entry(&rec, pool))
    }

    /// Lookup all entries whose `code` exactly matches the queried
    /// bytes. When the file carries an FST code index (v1.4.6 sub-
    /// phase C1 onwards), goes through the FST (O(|code|) instead of
    /// O(entry_count)) and walks the contiguous multi-reading run in
    /// the entry table. Falls back to a linear scan for v1.4.3-era
    /// files that ship with an empty FST section.
    pub fn lookup<'a>(&'a self, code: &[u8]) -> Vec<Entry<'a>> {
        let mut out: Vec<Entry<'a>> = Vec::new();
        if let Some(fst_bytes) = self.fst_code_index_bytes() {
            if let Ok(fst) = inputx_fsa::Fsa::new(fst_bytes) {
                if let Some(first) = fst.get(code) {
                    // Multi-reading run: entries are sorted by code, so
                    // all entries sharing this code are contiguous
                    // starting at `first`. Walk forward until the code
                    // changes or we hit EOF.
                    let total = self.header.entry_count as u64;
                    let mut idx = first;
                    while idx < total {
                        if let Some(e) = self.entry_at(idx as u32) {
                            if e.code.as_bytes() == code {
                                out.push(e);
                                idx += 1;
                                continue;
                            }
                        }
                        break;
                    }
                }
                return out;
            }
        }
        // Linear scan fallback (no FST or FST decode failed).
        for entry in self.entries() {
            if entry.code.as_bytes() == code {
                out.push(entry);
            }
        }
        out
    }

    /// Returns the raw FST code-index byte slice, or `None` when the
    /// file ships with an empty section (v1.4.3-era files).
    fn fst_code_index_bytes(&self) -> Option<&[u8]> {
        if self.header.fst_code_index_size == 0 {
            return None;
        }
        let buf = self.bytes.as_ref();
        let s = self.header.fst_code_index_offset as usize;
        let e = s + self.header.fst_code_index_size as usize;
        Some(&buf[s..e])
    }

    /// Reverse lookup by word. Same caveats as `lookup`.
    pub fn find_by_word<'a>(&'a self, word: &str) -> Vec<Entry<'a>> {
        let mut out: Vec<Entry<'a>> = Vec::new();
        for entry in self.entries() {
            if entry.word == word {
                out.push(entry);
            }
        }
        out
    }

    /// Top-k entries whose `code` starts with the prefix, ordered by
    /// `log_prior` desc. When the FST code index is populated, walks
    /// only the prefix subtree (O(matching codes) instead of
    /// O(entry_count)). Falls back to linear scan for v1.4.3-era files.
    pub fn prefix_top_k_fst<'a>(&'a self, prefix: &[u8], k: usize) -> Vec<Entry<'a>> {
        if k == 0 { return Vec::new(); }
        let Some(fst_bytes) = self.fst_code_index_bytes() else {
            return self.prefix_top_k(prefix, k);
        };
        let Ok(fst) = inputx_fsa::Fsa::new(fst_bytes) else {
            return self.prefix_top_k(prefix, k);
        };
        let total = self.header.entry_count as u64;
        let mut hits: Vec<Entry<'a>> = Vec::new();
        fst.prefix_for_each(prefix, |_code, first_idx| {
            let mut idx = first_idx;
            while idx < total {
                if let Some(e) = self.entry_at(idx as u32) {
                    if e.code.as_bytes().starts_with(prefix) {
                        // Same code group?
                        if let Some(prev) = hits.last() {
                            if prev.code.as_bytes() == e.code.as_bytes() {
                                hits.push(e);
                                idx += 1;
                                continue;
                            }
                        }
                        // first reading of this code — caller iterates
                        // codes via FST so we only push the run for the
                        // current code.
                        if e.code.as_bytes() == &_code[..] {
                            hits.push(e);
                            idx += 1;
                            continue;
                        }
                    }
                }
                break;
            }
        });
        hits.sort_by(|a, b| {
            b.log_prior
                .cmp(&a.log_prior)
                .then_with(|| a.code.cmp(b.code))
        });
        hits.truncate(k);
        hits
    }

    /// Streaming visit of all entries whose `code` starts with `prefix`,
    /// FST-indexed (O(matching codes) instead of O(entry_count)). The
    /// callback receives each [`Entry`] by value (`Copy`), so callers
    /// can build their own ranking / top-k / cement-business-rule re-
    /// score over the result stream without paying for an interim
    /// `Vec` allocation or a fixed sort policy. Visit order is the
    /// FST's prefix-walk order (code-asc), with per-code multi-reading
    /// entries in entry-table order.
    ///
    /// Cement-level use case: the composite pinyin adapter's
    /// `push_prefix_top_k` and `single_letter_cache` need raw freq
    /// (via `estimated_freq_from_log_prior`) + word-length bias +
    /// proximity factor applied per entry before ranking — a fixed
    /// `prefix_top_k_fst` sort by `log_prior` desc would either drop
    /// would-be winners or force the cement to scan the full entry
    /// table to recover what FST already knows.
    ///
    /// Falls back to a linear scan + filter on v1.4.3-era files without
    /// a populated FST section.
    pub fn prefix_for_each_entry<'a, F: FnMut(Entry<'a>)>(
        &'a self,
        prefix: &[u8],
        mut visit: F,
    ) {
        if let Some(fst_bytes) = self.fst_code_index_bytes()
            && let Ok(fst) = inputx_fsa::Fsa::new(fst_bytes)
        {
            let total = self.header.entry_count as u64;
            fst.prefix_for_each(prefix, |code, first_idx| {
                let mut idx = first_idx;
                while idx < total {
                    let Some(e) = self.entry_at(idx as u32) else { break; };
                    if e.code.as_bytes() != code { break; }
                    visit(e);
                    idx += 1;
                }
            });
            return;
        }
        // Linear scan fallback (v1.4.3-era files with empty FST section).
        for entry in self.entries() {
            if entry.code.as_bytes().starts_with(prefix) {
                visit(entry);
            }
        }
    }

    // ---- legacy / linear-scan path ----

    /// Original linear-scan top-k (v1.4.3 fallback).
    /// `log_prior` desc. Linear scan + top-k heap (BinaryHeap for std;
    /// sorted insert otherwise).
    pub fn prefix_top_k<'a>(&'a self, prefix: &[u8], k: usize) -> Vec<Entry<'a>> {
        if k == 0 {
            return Vec::new();
        }
        let mut candidates: Vec<Entry<'a>> = self
            .entries()
            .filter(|e| e.code.as_bytes().starts_with(prefix))
            .collect();
        // Sort by log_prior desc, tiebreaker code asc for determinism.
        candidates.sort_by(|a, b| {
            b.log_prior
                .cmp(&a.log_prior)
                .then_with(|| a.code.cmp(b.code))
        });
        candidates.truncate(k);
        candidates
    }

    fn string_pool(&self) -> &[u8] {
        let buf = self.bytes.as_ref();
        let start = self.header.string_pool_offset as usize;
        let end = start + self.header.string_pool_size as usize;
        &buf[start..end]
    }
}

#[cfg(feature = "std")]
impl IdfReader<memmap2::Mmap> {
    /// Open an IDFv1 file at `path` via mmap. Zero-copy: lookups borrow
    /// directly from the mmap region. The reader holds the mmap alive;
    /// drop the reader to drop the mapping.
    pub fn open<P: AsRef<std::path::Path>>(path: P) -> Result<Self, OpenError> {
        let file = std::fs::File::open(path)?;
        // SAFETY: caller's IDFv1 file is treated as immutable while the
        // mapping is alive. memmap2 leaves the file open via the handle.
        let mmap = unsafe { memmap2::Mmap::map(&file)? };
        Self::from_bytes(mmap)
    }
}

/// Decode an [`EntryRecord`] + string-pool region into a borrowed
/// [`Entry`]. The two `&str` borrows live as long as the pool slice.
fn decode_entry<'a>(rec: &EntryRecord, pool: &'a [u8]) -> Entry<'a> {
    let word = read_string(pool, rec.word_offset);
    let code = read_string(pool, rec.code_offset);
    Entry {
        word,
        code,
        log_prior: rec.log_prior,
        raw_freq: rec.raw_freq,
        match_type: decode_match_type(rec.match_type),
        flags: EntryFlags(rec.flags),
    }
}

/// Read a `\0`-terminated UTF-8 string from the pool at `offset`. Falls
/// back to an empty string when the offset is out of range — defensive
/// against bit-rot rather than a real expected case (writer enforces
/// well-formed offsets).
fn read_string(pool: &[u8], offset: u32) -> &str {
    let start = offset as usize;
    if start >= pool.len() {
        return "";
    }
    let rest = &pool[start..];
    let end = rest.iter().position(|&b| b == 0).unwrap_or(rest.len());
    core::str::from_utf8(&rest[..end]).unwrap_or("")
}