inputx_dict_format/reader.rs
1//! `IdfReader` — mmap zero-copy reader for IDFv1 files.
2//!
3//! Cold load: O(0) parse — open the file, validate the 96-byte header
4//! region, set up offset views into the mmap region. Lookup: direct
5//! pointer arithmetic, no allocation.
6
7use alloc::vec::Vec;
8use core::result::Result;
9
10use crate::codec::{
11 decode_match_type, EngineKind, EntryFlags, EntryRecord, Header, Version,
12 FULL_HEADER_SIZE, MAGIC,
13};
14
15/// Lookup error kind. Public so callers can distinguish "file is a
16/// different IDFv version" from "file is corrupt" — actionable
17/// difference: a v2 file should drive a tool upgrade prompt; a corrupt
18/// file should drive a rebuild.
19#[derive(Debug)]
20pub enum OpenError {
21 /// I/O failure while reading the file. Wraps the source.
22 #[cfg(feature = "std")]
23 Io(std::io::Error),
24 /// File is too short to hold even the header region.
25 TooShort,
26 /// First 4 bytes are not `b"IDFv"`. File is not an IDF dict at all.
27 BadMagic,
28 /// `format_version` byte is not `1`. A future v2 reader will accept
29 /// these; v1 reader returns this so the caller can prompt a rebuild
30 /// against the older tool.
31 UnsupportedVersion(u8),
32 /// `engine_kind` byte is outside the known set (per
33 /// [`EngineKind::from_byte`]).
34 UnknownEngineKind(u8),
35 /// Header section offsets disagree with the file size — points past
36 /// EOF or overlap. Indicates a truncated or rewritten file.
37 CorruptOffsets,
38 /// sha256_of_payload in the header does not match a fresh hash of
39 /// the post-header bytes. Indicates the file was tampered with or
40 /// corrupted between build and load.
41 Sha256Mismatch,
42}
43
44#[cfg(feature = "std")]
45impl From<std::io::Error> for OpenError {
46 fn from(e: std::io::Error) -> Self { Self::Io(e) }
47}
48
49/// A read-only entry view backed by mmap (when `IdfReader` was opened
50/// from a file) or by an owned byte buffer (in `no_std` use). `word`
51/// and `code` borrow string-pool bytes; the lifetime ties them to the
52/// underlying reader.
53#[derive(Copy, Clone, Debug)]
54pub struct Entry<'a> {
55 pub word: &'a str,
56 pub code: &'a str,
57 pub log_prior: i16,
58 /// Pre-quantization corpus frequency. Added v1.4.7 sub-phase A4
59 /// as a lossless tiebreaker for entries that share the same Q4
60 /// `log_prior` bucket — without it, cement-side cement can only
61 /// recover an estimated freq via `exp(log_prior/Q4) - 1`, which
62 /// collides exactly for bucket-mates. `0` on legacy / synthetic
63 /// .idf blobs (the byte slot was previously an unused
64 /// `bigram_offset` placeholder, always zero).
65 pub raw_freq: u32,
66 pub match_type: inputx_scoring::MatchType,
67 pub flags: EntryFlags,
68}
69
70/// IDFv1 reader. Holds the file's byte region and the parsed header.
71/// `entry_index` lookups are constant-time arithmetic; FST code/word
72/// scans use `inputx_fsa` directly over the index sections.
73pub struct IdfReader<B>
74where
75 B: AsRef<[u8]>,
76{
77 bytes: B,
78 header: Header,
79}
80
81impl<B: AsRef<[u8]>> IdfReader<B> {
82 /// Build a reader from an in-memory byte region. Validates the
83 /// header and (in std builds) the sha256 of payload.
84 pub fn from_bytes(bytes: B) -> Result<Self, OpenError> {
85 let buf = bytes.as_ref();
86 if buf.len() < FULL_HEADER_SIZE {
87 return Err(OpenError::TooShort);
88 }
89 if buf[0..4] != MAGIC {
90 return Err(OpenError::BadMagic);
91 }
92 let header = Header::parse(buf).ok_or(OpenError::BadMagic)?;
93 if Version::from_byte(header.format_version).is_none() {
94 return Err(OpenError::UnsupportedVersion(header.format_version));
95 }
96 if EngineKind::from_byte(header.engine_kind).is_none() {
97 return Err(OpenError::UnknownEngineKind(header.engine_kind));
98 }
99 // Bounds-check the section offsets against the file size — any
100 // off-by-N or attacker-tampered file slips out here.
101 let file_len = buf.len() as u32;
102 for (off, sz) in [
103 (header.string_pool_offset, header.string_pool_size),
104 (
105 header.entry_table_offset,
106 header.entry_count.saturating_mul(crate::codec::ENTRY_SIZE as u32),
107 ),
108 (header.fst_code_index_offset, header.fst_code_index_size),
109 (header.fst_word_index_offset, header.fst_word_index_size),
110 ] {
111 if off > file_len || off.saturating_add(sz) > file_len {
112 return Err(OpenError::CorruptOffsets);
113 }
114 }
115 // Verify sha256 of payload (covers everything after the sha256
116 // field; offset FULL_HEADER_SIZE .. file_end).
117 #[cfg(feature = "std")]
118 {
119 use sha2::{Digest, Sha256};
120 let mut hasher = Sha256::new();
121 hasher.update(&buf[FULL_HEADER_SIZE..]);
122 let got: [u8; 32] = hasher.finalize().into();
123 if got != header.sha256_of_payload {
124 return Err(OpenError::Sha256Mismatch);
125 }
126 }
127 Ok(Self { bytes, header })
128 }
129
130 pub fn header(&self) -> &Header { &self.header }
131 pub fn version(&self) -> Version {
132 Version::from_byte(self.header.format_version).expect("validated at open")
133 }
134 pub fn engine_kind(&self) -> EngineKind {
135 EngineKind::from_byte(self.header.engine_kind).expect("validated at open")
136 }
137 pub fn entry_count(&self) -> u32 { self.header.entry_count }
138 pub fn sha256(&self) -> [u8; 32] { self.header.sha256_of_payload }
139
140 /// All entries in entry-table order. O(n) — used by `prefix_top_k`
141 /// fallback and by tests; production hot paths should go through
142 /// the FST code / word indexes.
143 pub fn entries(&self) -> impl Iterator<Item = Entry<'_>> + '_ {
144 let buf = self.bytes.as_ref();
145 let entry_table_start = self.header.entry_table_offset as usize;
146 let n = self.header.entry_count as usize;
147 let string_pool_start = self.header.string_pool_offset as usize;
148 let string_pool_end =
149 string_pool_start + self.header.string_pool_size as usize;
150 let pool = &buf[string_pool_start..string_pool_end];
151 (0..n).map(move |i| {
152 let off = entry_table_start + i * crate::codec::ENTRY_SIZE;
153 let rec_bytes: [u8; crate::codec::ENTRY_SIZE] = buf
154 [off..off + crate::codec::ENTRY_SIZE]
155 .try_into()
156 .expect("entry slice is exactly ENTRY_SIZE");
157 let rec = EntryRecord::parse(&rec_bytes);
158 decode_entry(&rec, pool)
159 })
160 }
161
162 /// Entry at the given index. `index` must be `< entry_count`.
163 pub fn entry_at(&self, index: u32) -> Option<Entry<'_>> {
164 if index >= self.header.entry_count {
165 return None;
166 }
167 let buf = self.bytes.as_ref();
168 let off = self.header.entry_table_offset as usize
169 + index as usize * crate::codec::ENTRY_SIZE;
170 let rec_bytes: [u8; crate::codec::ENTRY_SIZE] = buf
171 [off..off + crate::codec::ENTRY_SIZE]
172 .try_into()
173 .ok()?;
174 let rec = EntryRecord::parse(&rec_bytes);
175 let pool = self.string_pool();
176 Some(decode_entry(&rec, pool))
177 }
178
179 /// Lookup all entries whose `code` exactly matches the queried
180 /// bytes. When the file carries an FST code index (v1.4.6 sub-
181 /// phase C1 onwards), goes through the FST (O(|code|) instead of
182 /// O(entry_count)) and walks the contiguous multi-reading run in
183 /// the entry table. Falls back to a linear scan for v1.4.3-era
184 /// files that ship with an empty FST section.
185 pub fn lookup<'a>(&'a self, code: &[u8]) -> Vec<Entry<'a>> {
186 let mut out: Vec<Entry<'a>> = Vec::new();
187 if let Some(fst_bytes) = self.fst_code_index_bytes() {
188 if let Ok(fst) = inputx_fsa::Fsa::new(fst_bytes) {
189 if let Some(first) = fst.get(code) {
190 // Multi-reading run: entries are sorted by code, so
191 // all entries sharing this code are contiguous
192 // starting at `first`. Walk forward until the code
193 // changes or we hit EOF.
194 let total = self.header.entry_count as u64;
195 let mut idx = first;
196 while idx < total {
197 if let Some(e) = self.entry_at(idx as u32) {
198 if e.code.as_bytes() == code {
199 out.push(e);
200 idx += 1;
201 continue;
202 }
203 }
204 break;
205 }
206 }
207 return out;
208 }
209 }
210 // Linear scan fallback (no FST or FST decode failed).
211 for entry in self.entries() {
212 if entry.code.as_bytes() == code {
213 out.push(entry);
214 }
215 }
216 out
217 }
218
219 /// Returns the raw FST code-index byte slice, or `None` when the
220 /// file ships with an empty section (v1.4.3-era files).
221 fn fst_code_index_bytes(&self) -> Option<&[u8]> {
222 if self.header.fst_code_index_size == 0 {
223 return None;
224 }
225 let buf = self.bytes.as_ref();
226 let s = self.header.fst_code_index_offset as usize;
227 let e = s + self.header.fst_code_index_size as usize;
228 Some(&buf[s..e])
229 }
230
231 /// Reverse lookup by word. Same caveats as `lookup`.
232 pub fn find_by_word<'a>(&'a self, word: &str) -> Vec<Entry<'a>> {
233 let mut out: Vec<Entry<'a>> = Vec::new();
234 for entry in self.entries() {
235 if entry.word == word {
236 out.push(entry);
237 }
238 }
239 out
240 }
241
242 /// Top-k entries whose `code` starts with the prefix, ordered by
243 /// `log_prior` desc. When the FST code index is populated, walks
244 /// only the prefix subtree (O(matching codes) instead of
245 /// O(entry_count)). Falls back to linear scan for v1.4.3-era files.
246 pub fn prefix_top_k_fst<'a>(&'a self, prefix: &[u8], k: usize) -> Vec<Entry<'a>> {
247 if k == 0 { return Vec::new(); }
248 let Some(fst_bytes) = self.fst_code_index_bytes() else {
249 return self.prefix_top_k(prefix, k);
250 };
251 let Ok(fst) = inputx_fsa::Fsa::new(fst_bytes) else {
252 return self.prefix_top_k(prefix, k);
253 };
254 let total = self.header.entry_count as u64;
255 let mut hits: Vec<Entry<'a>> = Vec::new();
256 fst.prefix_for_each(prefix, |_code, first_idx| {
257 let mut idx = first_idx;
258 while idx < total {
259 if let Some(e) = self.entry_at(idx as u32) {
260 if e.code.as_bytes().starts_with(prefix) {
261 // Same code group?
262 if let Some(prev) = hits.last() {
263 if prev.code.as_bytes() == e.code.as_bytes() {
264 hits.push(e);
265 idx += 1;
266 continue;
267 }
268 }
269 // first reading of this code — caller iterates
270 // codes via FST so we only push the run for the
271 // current code.
272 if e.code.as_bytes() == &_code[..] {
273 hits.push(e);
274 idx += 1;
275 continue;
276 }
277 }
278 }
279 break;
280 }
281 });
282 hits.sort_by(|a, b| {
283 b.log_prior
284 .cmp(&a.log_prior)
285 .then_with(|| a.code.cmp(b.code))
286 });
287 hits.truncate(k);
288 hits
289 }
290
291 /// Streaming visit of all entries whose `code` starts with `prefix`,
292 /// FST-indexed (O(matching codes) instead of O(entry_count)). The
293 /// callback receives each [`Entry`] by value (`Copy`), so callers
294 /// can build their own ranking / top-k / cement-business-rule re-
295 /// score over the result stream without paying for an interim
296 /// `Vec` allocation or a fixed sort policy. Visit order is the
297 /// FST's prefix-walk order (code-asc), with per-code multi-reading
298 /// entries in entry-table order.
299 ///
300 /// Cement-level use case: the composite pinyin adapter's
301 /// `push_prefix_top_k` and `single_letter_cache` need raw freq
302 /// (via `estimated_freq_from_log_prior`) + word-length bias +
303 /// proximity factor applied per entry before ranking — a fixed
304 /// `prefix_top_k_fst` sort by `log_prior` desc would either drop
305 /// would-be winners or force the cement to scan the full entry
306 /// table to recover what FST already knows.
307 ///
308 /// Falls back to a linear scan + filter on v1.4.3-era files without
309 /// a populated FST section.
310 pub fn prefix_for_each_entry<'a, F: FnMut(Entry<'a>)>(
311 &'a self,
312 prefix: &[u8],
313 mut visit: F,
314 ) {
315 if let Some(fst_bytes) = self.fst_code_index_bytes()
316 && let Ok(fst) = inputx_fsa::Fsa::new(fst_bytes)
317 {
318 let total = self.header.entry_count as u64;
319 fst.prefix_for_each(prefix, |code, first_idx| {
320 let mut idx = first_idx;
321 while idx < total {
322 let Some(e) = self.entry_at(idx as u32) else { break; };
323 if e.code.as_bytes() != code { break; }
324 visit(e);
325 idx += 1;
326 }
327 });
328 return;
329 }
330 // Linear scan fallback (v1.4.3-era files with empty FST section).
331 for entry in self.entries() {
332 if entry.code.as_bytes().starts_with(prefix) {
333 visit(entry);
334 }
335 }
336 }
337
338 // ---- legacy / linear-scan path ----
339
340 /// Original linear-scan top-k (v1.4.3 fallback).
341 /// `log_prior` desc. Linear scan + top-k heap (BinaryHeap for std;
342 /// sorted insert otherwise).
343 pub fn prefix_top_k<'a>(&'a self, prefix: &[u8], k: usize) -> Vec<Entry<'a>> {
344 if k == 0 {
345 return Vec::new();
346 }
347 let mut candidates: Vec<Entry<'a>> = self
348 .entries()
349 .filter(|e| e.code.as_bytes().starts_with(prefix))
350 .collect();
351 // Sort by log_prior desc, tiebreaker code asc for determinism.
352 candidates.sort_by(|a, b| {
353 b.log_prior
354 .cmp(&a.log_prior)
355 .then_with(|| a.code.cmp(b.code))
356 });
357 candidates.truncate(k);
358 candidates
359 }
360
361 fn string_pool(&self) -> &[u8] {
362 let buf = self.bytes.as_ref();
363 let start = self.header.string_pool_offset as usize;
364 let end = start + self.header.string_pool_size as usize;
365 &buf[start..end]
366 }
367}
368
369#[cfg(feature = "std")]
370impl IdfReader<memmap2::Mmap> {
371 /// Open an IDFv1 file at `path` via mmap. Zero-copy: lookups borrow
372 /// directly from the mmap region. The reader holds the mmap alive;
373 /// drop the reader to drop the mapping.
374 pub fn open<P: AsRef<std::path::Path>>(path: P) -> Result<Self, OpenError> {
375 let file = std::fs::File::open(path)?;
376 // SAFETY: caller's IDFv1 file is treated as immutable while the
377 // mapping is alive. memmap2 leaves the file open via the handle.
378 let mmap = unsafe { memmap2::Mmap::map(&file)? };
379 Self::from_bytes(mmap)
380 }
381}
382
383/// Decode an [`EntryRecord`] + string-pool region into a borrowed
384/// [`Entry`]. The two `&str` borrows live as long as the pool slice.
385fn decode_entry<'a>(rec: &EntryRecord, pool: &'a [u8]) -> Entry<'a> {
386 let word = read_string(pool, rec.word_offset);
387 let code = read_string(pool, rec.code_offset);
388 Entry {
389 word,
390 code,
391 log_prior: rec.log_prior,
392 raw_freq: rec.raw_freq,
393 match_type: decode_match_type(rec.match_type),
394 flags: EntryFlags(rec.flags),
395 }
396}
397
398/// Read a `\0`-terminated UTF-8 string from the pool at `offset`. Falls
399/// back to an empty string when the offset is out of range — defensive
400/// against bit-rot rather than a real expected case (writer enforces
401/// well-formed offsets).
402fn read_string(pool: &[u8], offset: u32) -> &str {
403 let start = offset as usize;
404 if start >= pool.len() {
405 return "";
406 }
407 let rest = &pool[start..];
408 let end = rest.iter().position(|&b| b == 0).unwrap_or(rest.len());
409 core::str::from_utf8(&rest[..end]).unwrap_or("")
410}
411