Skip to main content

inputx_dict_format/
reader.rs

1//! `IdfReader` — mmap zero-copy reader for IDFv1 files.
2//!
3//! Cold load: O(0) parse — open the file, validate the 96-byte header
4//! region, set up offset views into the mmap region. Lookup: direct
5//! pointer arithmetic, no allocation.
6
7use alloc::vec::Vec;
8use core::result::Result;
9
10use crate::codec::{
11    decode_match_type, EngineKind, EntryFlags, EntryRecord, Header, Version,
12    FULL_HEADER_SIZE, MAGIC,
13};
14
15/// Lookup error kind. Public so callers can distinguish "file is a
16/// different IDFv version" from "file is corrupt" — actionable
17/// difference: a v2 file should drive a tool upgrade prompt; a corrupt
18/// file should drive a rebuild.
19#[derive(Debug)]
20pub enum OpenError {
21    /// I/O failure while reading the file. Wraps the source.
22    #[cfg(feature = "std")]
23    Io(std::io::Error),
24    /// File is too short to hold even the header region.
25    TooShort,
26    /// First 4 bytes are not `b"IDFv"`. File is not an IDF dict at all.
27    BadMagic,
28    /// `format_version` byte is not `1`. A future v2 reader will accept
29    /// these; v1 reader returns this so the caller can prompt a rebuild
30    /// against the older tool.
31    UnsupportedVersion(u8),
32    /// `engine_kind` byte is outside the known set (per
33    /// [`EngineKind::from_byte`]).
34    UnknownEngineKind(u8),
35    /// Header section offsets disagree with the file size — points past
36    /// EOF or overlap. Indicates a truncated or rewritten file.
37    CorruptOffsets,
38    /// sha256_of_payload in the header does not match a fresh hash of
39    /// the post-header bytes. Indicates the file was tampered with or
40    /// corrupted between build and load.
41    Sha256Mismatch,
42}
43
44#[cfg(feature = "std")]
45impl From<std::io::Error> for OpenError {
46    fn from(e: std::io::Error) -> Self { Self::Io(e) }
47}
48
49/// A read-only entry view backed by mmap (when `IdfReader` was opened
50/// from a file) or by an owned byte buffer (in `no_std` use). `word`
51/// and `code` borrow string-pool bytes; the lifetime ties them to the
52/// underlying reader.
53#[derive(Copy, Clone, Debug)]
54pub struct Entry<'a> {
55    pub word: &'a str,
56    pub code: &'a str,
57    pub log_prior: i16,
58    /// Pre-quantization corpus frequency. Added v1.4.7 sub-phase A4
59    /// as a lossless tiebreaker for entries that share the same Q4
60    /// `log_prior` bucket — without it, cement-side cement can only
61    /// recover an estimated freq via `exp(log_prior/Q4) - 1`, which
62    /// collides exactly for bucket-mates. `0` on legacy / synthetic
63    /// .idf blobs (the byte slot was previously an unused
64    /// `bigram_offset` placeholder, always zero).
65    pub raw_freq: u32,
66    pub match_type: inputx_scoring::MatchType,
67    pub flags: EntryFlags,
68}
69
70/// IDFv1 reader. Holds the file's byte region and the parsed header.
71/// `entry_index` lookups are constant-time arithmetic; FST code/word
72/// scans use `inputx_fsa` directly over the index sections.
73pub struct IdfReader<B>
74where
75    B: AsRef<[u8]>,
76{
77    bytes: B,
78    header: Header,
79}
80
81impl<B: AsRef<[u8]>> IdfReader<B> {
82    /// Build a reader from an in-memory byte region. Validates the
83    /// header and (in std builds) the sha256 of payload.
84    pub fn from_bytes(bytes: B) -> Result<Self, OpenError> {
85        let buf = bytes.as_ref();
86        if buf.len() < FULL_HEADER_SIZE {
87            return Err(OpenError::TooShort);
88        }
89        if buf[0..4] != MAGIC {
90            return Err(OpenError::BadMagic);
91        }
92        let header = Header::parse(buf).ok_or(OpenError::BadMagic)?;
93        if Version::from_byte(header.format_version).is_none() {
94            return Err(OpenError::UnsupportedVersion(header.format_version));
95        }
96        if EngineKind::from_byte(header.engine_kind).is_none() {
97            return Err(OpenError::UnknownEngineKind(header.engine_kind));
98        }
99        // Bounds-check the section offsets against the file size — any
100        // off-by-N or attacker-tampered file slips out here.
101        let file_len = buf.len() as u32;
102        for (off, sz) in [
103            (header.string_pool_offset, header.string_pool_size),
104            (
105                header.entry_table_offset,
106                header.entry_count.saturating_mul(crate::codec::ENTRY_SIZE as u32),
107            ),
108            (header.fst_code_index_offset, header.fst_code_index_size),
109            (header.fst_word_index_offset, header.fst_word_index_size),
110        ] {
111            if off > file_len || off.saturating_add(sz) > file_len {
112                return Err(OpenError::CorruptOffsets);
113            }
114        }
115        // Verify sha256 of payload (covers everything after the sha256
116        // field; offset FULL_HEADER_SIZE .. file_end).
117        #[cfg(feature = "std")]
118        {
119            use sha2::{Digest, Sha256};
120            let mut hasher = Sha256::new();
121            hasher.update(&buf[FULL_HEADER_SIZE..]);
122            let got: [u8; 32] = hasher.finalize().into();
123            if got != header.sha256_of_payload {
124                return Err(OpenError::Sha256Mismatch);
125            }
126        }
127        Ok(Self { bytes, header })
128    }
129
130    pub fn header(&self) -> &Header { &self.header }
131    pub fn version(&self) -> Version {
132        Version::from_byte(self.header.format_version).expect("validated at open")
133    }
134    pub fn engine_kind(&self) -> EngineKind {
135        EngineKind::from_byte(self.header.engine_kind).expect("validated at open")
136    }
137    pub fn entry_count(&self) -> u32 { self.header.entry_count }
138    pub fn sha256(&self) -> [u8; 32] { self.header.sha256_of_payload }
139
140    /// All entries in entry-table order. O(n) — used by `prefix_top_k`
141    /// fallback and by tests; production hot paths should go through
142    /// the FST code / word indexes.
143    pub fn entries(&self) -> impl Iterator<Item = Entry<'_>> + '_ {
144        let buf = self.bytes.as_ref();
145        let entry_table_start = self.header.entry_table_offset as usize;
146        let n = self.header.entry_count as usize;
147        let string_pool_start = self.header.string_pool_offset as usize;
148        let string_pool_end =
149            string_pool_start + self.header.string_pool_size as usize;
150        let pool = &buf[string_pool_start..string_pool_end];
151        (0..n).map(move |i| {
152            let off = entry_table_start + i * crate::codec::ENTRY_SIZE;
153            let rec_bytes: [u8; crate::codec::ENTRY_SIZE] = buf
154                [off..off + crate::codec::ENTRY_SIZE]
155                .try_into()
156                .expect("entry slice is exactly ENTRY_SIZE");
157            let rec = EntryRecord::parse(&rec_bytes);
158            decode_entry(&rec, pool)
159        })
160    }
161
162    /// Entry at the given index. `index` must be `< entry_count`.
163    pub fn entry_at(&self, index: u32) -> Option<Entry<'_>> {
164        if index >= self.header.entry_count {
165            return None;
166        }
167        let buf = self.bytes.as_ref();
168        let off = self.header.entry_table_offset as usize
169            + index as usize * crate::codec::ENTRY_SIZE;
170        let rec_bytes: [u8; crate::codec::ENTRY_SIZE] = buf
171            [off..off + crate::codec::ENTRY_SIZE]
172            .try_into()
173            .ok()?;
174        let rec = EntryRecord::parse(&rec_bytes);
175        let pool = self.string_pool();
176        Some(decode_entry(&rec, pool))
177    }
178
179    /// Lookup all entries whose `code` exactly matches the queried
180    /// bytes. When the file carries an FST code index (v1.4.6 sub-
181    /// phase C1 onwards), goes through the FST (O(|code|) instead of
182    /// O(entry_count)) and walks the contiguous multi-reading run in
183    /// the entry table. Falls back to a linear scan for v1.4.3-era
184    /// files that ship with an empty FST section.
185    pub fn lookup<'a>(&'a self, code: &[u8]) -> Vec<Entry<'a>> {
186        let mut out: Vec<Entry<'a>> = Vec::new();
187        if let Some(fst_bytes) = self.fst_code_index_bytes() {
188            if let Ok(fst) = inputx_fsa::Fsa::new(fst_bytes) {
189                if let Some(first) = fst.get(code) {
190                    // Multi-reading run: entries are sorted by code, so
191                    // all entries sharing this code are contiguous
192                    // starting at `first`. Walk forward until the code
193                    // changes or we hit EOF.
194                    let total = self.header.entry_count as u64;
195                    let mut idx = first;
196                    while idx < total {
197                        if let Some(e) = self.entry_at(idx as u32) {
198                            if e.code.as_bytes() == code {
199                                out.push(e);
200                                idx += 1;
201                                continue;
202                            }
203                        }
204                        break;
205                    }
206                }
207                return out;
208            }
209        }
210        // Linear scan fallback (no FST or FST decode failed).
211        for entry in self.entries() {
212            if entry.code.as_bytes() == code {
213                out.push(entry);
214            }
215        }
216        out
217    }
218
219    /// Returns the raw FST code-index byte slice, or `None` when the
220    /// file ships with an empty section (v1.4.3-era files).
221    fn fst_code_index_bytes(&self) -> Option<&[u8]> {
222        if self.header.fst_code_index_size == 0 {
223            return None;
224        }
225        let buf = self.bytes.as_ref();
226        let s = self.header.fst_code_index_offset as usize;
227        let e = s + self.header.fst_code_index_size as usize;
228        Some(&buf[s..e])
229    }
230
231    /// Reverse lookup by word. Same caveats as `lookup`.
232    pub fn find_by_word<'a>(&'a self, word: &str) -> Vec<Entry<'a>> {
233        let mut out: Vec<Entry<'a>> = Vec::new();
234        for entry in self.entries() {
235            if entry.word == word {
236                out.push(entry);
237            }
238        }
239        out
240    }
241
242    /// Top-k entries whose `code` starts with the prefix, ordered by
243    /// `log_prior` desc. When the FST code index is populated, walks
244    /// only the prefix subtree (O(matching codes) instead of
245    /// O(entry_count)). Falls back to linear scan for v1.4.3-era files.
246    pub fn prefix_top_k_fst<'a>(&'a self, prefix: &[u8], k: usize) -> Vec<Entry<'a>> {
247        if k == 0 { return Vec::new(); }
248        let Some(fst_bytes) = self.fst_code_index_bytes() else {
249            return self.prefix_top_k(prefix, k);
250        };
251        let Ok(fst) = inputx_fsa::Fsa::new(fst_bytes) else {
252            return self.prefix_top_k(prefix, k);
253        };
254        let total = self.header.entry_count as u64;
255        let mut hits: Vec<Entry<'a>> = Vec::new();
256        fst.prefix_for_each(prefix, |_code, first_idx| {
257            let mut idx = first_idx;
258            while idx < total {
259                if let Some(e) = self.entry_at(idx as u32) {
260                    if e.code.as_bytes().starts_with(prefix) {
261                        // Same code group?
262                        if let Some(prev) = hits.last() {
263                            if prev.code.as_bytes() == e.code.as_bytes() {
264                                hits.push(e);
265                                idx += 1;
266                                continue;
267                            }
268                        }
269                        // first reading of this code — caller iterates
270                        // codes via FST so we only push the run for the
271                        // current code.
272                        if e.code.as_bytes() == &_code[..] {
273                            hits.push(e);
274                            idx += 1;
275                            continue;
276                        }
277                    }
278                }
279                break;
280            }
281        });
282        hits.sort_by(|a, b| {
283            b.log_prior
284                .cmp(&a.log_prior)
285                .then_with(|| a.code.cmp(b.code))
286        });
287        hits.truncate(k);
288        hits
289    }
290
291    /// Streaming visit of all entries whose `code` starts with `prefix`,
292    /// FST-indexed (O(matching codes) instead of O(entry_count)). The
293    /// callback receives each [`Entry`] by value (`Copy`), so callers
294    /// can build their own ranking / top-k / cement-business-rule re-
295    /// score over the result stream without paying for an interim
296    /// `Vec` allocation or a fixed sort policy. Visit order is the
297    /// FST's prefix-walk order (code-asc), with per-code multi-reading
298    /// entries in entry-table order.
299    ///
300    /// Cement-level use case: the composite pinyin adapter's
301    /// `push_prefix_top_k` and `single_letter_cache` need raw freq
302    /// (via `estimated_freq_from_log_prior`) + word-length bias +
303    /// proximity factor applied per entry before ranking — a fixed
304    /// `prefix_top_k_fst` sort by `log_prior` desc would either drop
305    /// would-be winners or force the cement to scan the full entry
306    /// table to recover what FST already knows.
307    ///
308    /// Falls back to a linear scan + filter on v1.4.3-era files without
309    /// a populated FST section.
310    pub fn prefix_for_each_entry<'a, F: FnMut(Entry<'a>)>(
311        &'a self,
312        prefix: &[u8],
313        mut visit: F,
314    ) {
315        if let Some(fst_bytes) = self.fst_code_index_bytes()
316            && let Ok(fst) = inputx_fsa::Fsa::new(fst_bytes)
317        {
318            let total = self.header.entry_count as u64;
319            fst.prefix_for_each(prefix, |code, first_idx| {
320                let mut idx = first_idx;
321                while idx < total {
322                    let Some(e) = self.entry_at(idx as u32) else { break; };
323                    if e.code.as_bytes() != code { break; }
324                    visit(e);
325                    idx += 1;
326                }
327            });
328            return;
329        }
330        // Linear scan fallback (v1.4.3-era files with empty FST section).
331        for entry in self.entries() {
332            if entry.code.as_bytes().starts_with(prefix) {
333                visit(entry);
334            }
335        }
336    }
337
338    // ---- legacy / linear-scan path ----
339
340    /// Original linear-scan top-k (v1.4.3 fallback).
341    /// `log_prior` desc. Linear scan + top-k heap (BinaryHeap for std;
342    /// sorted insert otherwise).
343    pub fn prefix_top_k<'a>(&'a self, prefix: &[u8], k: usize) -> Vec<Entry<'a>> {
344        if k == 0 {
345            return Vec::new();
346        }
347        let mut candidates: Vec<Entry<'a>> = self
348            .entries()
349            .filter(|e| e.code.as_bytes().starts_with(prefix))
350            .collect();
351        // Sort by log_prior desc, tiebreaker code asc for determinism.
352        candidates.sort_by(|a, b| {
353            b.log_prior
354                .cmp(&a.log_prior)
355                .then_with(|| a.code.cmp(b.code))
356        });
357        candidates.truncate(k);
358        candidates
359    }
360
361    fn string_pool(&self) -> &[u8] {
362        let buf = self.bytes.as_ref();
363        let start = self.header.string_pool_offset as usize;
364        let end = start + self.header.string_pool_size as usize;
365        &buf[start..end]
366    }
367}
368
369#[cfg(feature = "std")]
370impl IdfReader<memmap2::Mmap> {
371    /// Open an IDFv1 file at `path` via mmap. Zero-copy: lookups borrow
372    /// directly from the mmap region. The reader holds the mmap alive;
373    /// drop the reader to drop the mapping.
374    pub fn open<P: AsRef<std::path::Path>>(path: P) -> Result<Self, OpenError> {
375        let file = std::fs::File::open(path)?;
376        // SAFETY: caller's IDFv1 file is treated as immutable while the
377        // mapping is alive. memmap2 leaves the file open via the handle.
378        let mmap = unsafe { memmap2::Mmap::map(&file)? };
379        Self::from_bytes(mmap)
380    }
381}
382
383/// Decode an [`EntryRecord`] + string-pool region into a borrowed
384/// [`Entry`]. The two `&str` borrows live as long as the pool slice.
385fn decode_entry<'a>(rec: &EntryRecord, pool: &'a [u8]) -> Entry<'a> {
386    let word = read_string(pool, rec.word_offset);
387    let code = read_string(pool, rec.code_offset);
388    Entry {
389        word,
390        code,
391        log_prior: rec.log_prior,
392        raw_freq: rec.raw_freq,
393        match_type: decode_match_type(rec.match_type),
394        flags: EntryFlags(rec.flags),
395    }
396}
397
398/// Read a `\0`-terminated UTF-8 string from the pool at `offset`. Falls
399/// back to an empty string when the offset is out of range — defensive
400/// against bit-rot rather than a real expected case (writer enforces
401/// well-formed offsets).
402fn read_string(pool: &[u8], offset: u32) -> &str {
403    let start = offset as usize;
404    if start >= pool.len() {
405        return "";
406    }
407    let rest = &pool[start..];
408    let end = rest.iter().position(|&b| b == 0).unwrap_or(rest.len());
409    core::str::from_utf8(&rest[..end]).unwrap_or("")
410}
411