1use core::convert::TryInto;
9
/// Four-byte file signature (`IDFv` in ASCII).
///
/// [`Header::parse`] rejects any buffer that does not begin with these bytes.
pub const MAGIC: [u8; 4] = [b'I', b'D', b'F', b'v'];

/// Length in bytes of the fixed header block returned by [`Header::to_bytes`].
pub const HEADER_SIZE: usize = 64;

/// Length in bytes of one serialized [`EntryRecord`].
pub const ENTRY_SIZE: usize = 16;
18
/// Format versions recognised by this module.
///
/// The discriminant is the byte value that identifies the version.
#[repr(u8)]
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum Version {
    /// Version 1, encoded as the byte `1`.
    V1 = 1,
}

impl Version {
    /// Maps a raw version byte to a known [`Version`].
    ///
    /// Returns `None` for every byte that does not name a known version.
    pub fn from_byte(b: u8) -> Option<Self> {
        if b == Self::V1 as u8 {
            Some(Self::V1)
        } else {
            None
        }
    }
}
39
/// Engine identifiers, each with a fixed one-byte encoding given by its
/// discriminant.
#[repr(u8)]
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum EngineKind {
    /// Encoded as `0`.
    Pinyin = 0,
    /// Encoded as `1`.
    Wubi = 1,
    /// Encoded as `2`.
    NihongoJukugo = 2,
    /// Encoded as `3`.
    NihongoKanji = 3,
    /// Encoded as `4`.
    Other = 4,
}

impl EngineKind {
    /// Maps a raw engine byte to the variant whose discriminant equals it.
    ///
    /// Returns `None` when no variant uses that byte.
    pub fn from_byte(b: u8) -> Option<Self> {
        let known = [
            Self::Pinyin,
            Self::Wubi,
            Self::NihongoJukugo,
            Self::NihongoKanji,
            Self::Other,
        ];
        // Compare against each variant's own discriminant so the lookup never
        // depends on the variants being numbered contiguously.
        known.iter().copied().find(|kind| *kind as u8 == b)
    }
}
64
/// Thin wrapper around a per-entry flag byte.
///
/// The low three bits are independent boolean flags and the top three bits
/// hold a small engine tag; bits 3 and 4 are not interpreted by this type.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub struct EntryFlags(pub u8);

impl EntryFlags {
    /// Bit 0.
    pub const BLACKLIST: u8 = 0b0000_0001;
    /// Bit 1.
    pub const CURATED_OVERRIDE: u8 = 0b0000_0010;
    /// Bit 2.
    pub const USER_ADDED: u8 = 0b0000_0100;
    /// Bits 5..=7, which carry the engine tag.
    pub const ENGINE_TAG_MASK: u8 = 0b1110_0000;
    /// Number of bits the engine tag is shifted left inside the byte.
    pub const ENGINE_TAG_SHIFT: u8 = 5;

    /// Returns `true` when the [`Self::BLACKLIST`] bit is set.
    pub fn is_blacklisted(self) -> bool {
        self.0 & Self::BLACKLIST == Self::BLACKLIST
    }

    /// Returns `true` when the [`Self::CURATED_OVERRIDE`] bit is set.
    pub fn is_curated_override(self) -> bool {
        self.0 & Self::CURATED_OVERRIDE == Self::CURATED_OVERRIDE
    }

    /// Returns `true` when the [`Self::USER_ADDED`] bit is set.
    pub fn is_user_added(self) -> bool {
        self.0 & Self::USER_ADDED == Self::USER_ADDED
    }

    /// Extracts the three-bit engine tag as a value in `0..=7`.
    pub fn engine_tag(self) -> u8 {
        let tag_bits = self.0 & Self::ENGINE_TAG_MASK;
        tag_bits >> Self::ENGINE_TAG_SHIFT
    }

    /// Returns a copy with the engine tag replaced by the low three bits of
    /// `tag`; higher bits of `tag` are discarded and all other flag bits are
    /// left untouched.
    pub fn with_engine_tag(self, tag: u8) -> Self {
        let tag_bits = (tag & 0b111) << Self::ENGINE_TAG_SHIFT;
        let other_bits = self.0 & !Self::ENGINE_TAG_MASK;
        Self(other_bits | tag_bits)
    }
}
100
101#[derive(Copy, Clone, Debug, PartialEq, Eq)]
105pub struct Header {
106 pub magic: [u8; 4],
107 pub format_version: u8,
108 pub engine_kind: u8,
109 pub flags: u16,
110 pub entry_count: u32,
111 pub string_pool_offset: u32,
112 pub string_pool_size: u32,
113 pub entry_table_offset: u32,
114 pub fst_code_index_offset: u32,
115 pub fst_code_index_size: u32,
116 pub fst_word_index_offset: u32,
117 pub fst_word_index_size: u32,
118 pub bigram_offset: u32,
119 pub bigram_size: u32,
120 pub embedding_offset: u32,
121 pub embedding_dim: u16,
122 pub embedding_dtype: u8,
123 pub reserved: u8,
124 pub sha256_of_payload: [u8; 32],
125}
126
127impl Header {
128 pub fn to_bytes(&self) -> [u8; HEADER_SIZE] {
130 let mut buf = [0u8; HEADER_SIZE];
131 buf[0..4].copy_from_slice(&self.magic);
132 buf[4] = self.format_version;
133 buf[5] = self.engine_kind;
134 buf[6..8].copy_from_slice(&self.flags.to_le_bytes());
135 buf[8..12].copy_from_slice(&self.entry_count.to_le_bytes());
136 buf[12..16].copy_from_slice(&self.string_pool_offset.to_le_bytes());
137 buf[16..20].copy_from_slice(&self.string_pool_size.to_le_bytes());
138 buf[20..24].copy_from_slice(&self.entry_table_offset.to_le_bytes());
139 buf[24..28].copy_from_slice(&self.fst_code_index_offset.to_le_bytes());
140 buf[28..32].copy_from_slice(&self.fst_code_index_size.to_le_bytes());
141 buf[32..36].copy_from_slice(&self.fst_word_index_offset.to_le_bytes());
142 buf[36..40].copy_from_slice(&self.fst_word_index_size.to_le_bytes());
143 buf[40..44].copy_from_slice(&self.bigram_offset.to_le_bytes());
144 buf[44..48].copy_from_slice(&self.bigram_size.to_le_bytes());
145 buf[48..52].copy_from_slice(&self.embedding_offset.to_le_bytes());
146 buf[52..54].copy_from_slice(&self.embedding_dim.to_le_bytes());
147 buf[54] = self.embedding_dtype;
148 buf[55] = self.reserved;
149 buf
162 }
163
164 pub fn parse(buf: &[u8]) -> Option<Self> {
167 if buf.len() < HEADER_SIZE + 32 { return None; }
168 if buf[0..4] != MAGIC { return None; }
169 let mut sha = [0u8; 32];
170 sha.copy_from_slice(&buf[HEADER_SIZE..HEADER_SIZE + 32]);
171 Some(Self {
172 magic: MAGIC,
173 format_version: buf[4],
174 engine_kind: buf[5],
175 flags: u16::from_le_bytes(buf[6..8].try_into().ok()?),
176 entry_count: u32::from_le_bytes(buf[8..12].try_into().ok()?),
177 string_pool_offset: u32::from_le_bytes(buf[12..16].try_into().ok()?),
178 string_pool_size: u32::from_le_bytes(buf[16..20].try_into().ok()?),
179 entry_table_offset: u32::from_le_bytes(buf[20..24].try_into().ok()?),
180 fst_code_index_offset: u32::from_le_bytes(buf[24..28].try_into().ok()?),
181 fst_code_index_size: u32::from_le_bytes(buf[28..32].try_into().ok()?),
182 fst_word_index_offset: u32::from_le_bytes(buf[32..36].try_into().ok()?),
183 fst_word_index_size: u32::from_le_bytes(buf[36..40].try_into().ok()?),
184 bigram_offset: u32::from_le_bytes(buf[40..44].try_into().ok()?),
185 bigram_size: u32::from_le_bytes(buf[44..48].try_into().ok()?),
186 embedding_offset: u32::from_le_bytes(buf[48..52].try_into().ok()?),
187 embedding_dim: u16::from_le_bytes(buf[52..54].try_into().ok()?),
188 embedding_dtype: buf[54],
189 reserved: buf[55],
190 sha256_of_payload: sha,
191 })
192 }
193}
194
195pub const SHA256_SIZE: usize = 32;
199
200pub const FULL_HEADER_SIZE: usize = HEADER_SIZE + SHA256_SIZE;
202
203#[derive(Copy, Clone, Debug, PartialEq, Eq)]
220pub struct EntryRecord {
221 pub word_offset: u32, pub code_offset: u32, pub log_prior: i16,
224 pub match_type: u8,
225 pub flags: u8,
226 pub raw_freq: u32,
227 pub embedding_offset: u32,
228}
229
230impl EntryRecord {
231 pub fn to_bytes(&self) -> [u8; ENTRY_SIZE] {
233 let mut buf = [0u8; ENTRY_SIZE];
234 let wo = self.word_offset.to_le_bytes();
236 buf[0..3].copy_from_slice(&wo[0..3]);
237 let co = self.code_offset.to_le_bytes();
238 buf[3..6].copy_from_slice(&co[0..3]);
239 buf[6..8].copy_from_slice(&self.log_prior.to_le_bytes());
240 buf[8] = self.match_type;
241 buf[9] = self.flags;
242 buf[10..14].copy_from_slice(&self.raw_freq.to_le_bytes());
243 buf
247 }
248
249 pub fn parse(buf: &[u8; ENTRY_SIZE]) -> Self {
251 let mut wo = [0u8; 4];
252 wo[0..3].copy_from_slice(&buf[0..3]);
253 let word_offset = u32::from_le_bytes(wo);
254 let mut co = [0u8; 4];
255 co[0..3].copy_from_slice(&buf[3..6]);
256 let code_offset = u32::from_le_bytes(co);
257 let log_prior = i16::from_le_bytes([buf[6], buf[7]]);
258 let match_type = buf[8];
259 let flags = buf[9];
260 let raw_freq = u32::from_le_bytes([buf[10], buf[11], buf[12], buf[13]]);
261 EntryRecord {
262 word_offset,
263 code_offset,
264 log_prior,
265 match_type,
266 flags,
267 raw_freq,
268 embedding_offset: 0, }
270 }
271}
272
273pub fn encode_match_type(mt: inputx_scoring::MatchType) -> u8 {
280 match mt {
281 inputx_scoring::MatchType::Exact => 0,
282 inputx_scoring::MatchType::Prefix(_) => 1,
283 inputx_scoring::MatchType::Fuzzy(_) => 2,
284 inputx_scoring::MatchType::Composed { .. } => 3,
285 }
286}
287
288pub fn decode_match_type(b: u8) -> inputx_scoring::MatchType {
291 match b {
292 0 => inputx_scoring::MatchType::Exact,
293 1 => inputx_scoring::MatchType::Prefix(0),
294 2 => inputx_scoring::MatchType::Fuzzy(0),
295 3 => inputx_scoring::MatchType::Composed { bigram_links: 0 },
296 _ => inputx_scoring::MatchType::Exact, }
298}
299
#[cfg(test)]
mod tests {
    use super::*;

    /// Pins the on-disk size constants so an accidental edit is caught.
    #[test]
    fn header_size_constants_match_spec() {
        assert_eq!(HEADER_SIZE, 64);
        assert_eq!(SHA256_SIZE, 32);
        assert_eq!(FULL_HEADER_SIZE, 96);
        assert_eq!(ENTRY_SIZE, 16);
    }

    /// Every header field must survive `to_bytes` followed by `parse`.
    #[test]
    fn header_round_trip_preserves_all_fields() {
        let h = Header {
            magic: MAGIC,
            format_version: 1,
            engine_kind: 2,
            flags: 0x0007,
            entry_count: 237_842,
            string_pool_offset: 96,
            string_pool_size: 2_097_152,
            entry_table_offset: 2_097_248,
            fst_code_index_offset: 5_900_000,
            fst_code_index_size: 1_048_576,
            fst_word_index_offset: 6_948_576,
            fst_word_index_size: 524_288,
            bigram_offset: 0,
            bigram_size: 0,
            embedding_offset: 0,
            embedding_dim: 0,
            embedding_dtype: 0,
            reserved: 0,
            sha256_of_payload: [0xab; 32],
        };
        let bytes = h.to_bytes();
        // `to_bytes` emits only the fixed block; the digest is appended by hand
        // because `parse` expects it directly after the header.
        let mut full = [0u8; FULL_HEADER_SIZE];
        full[..HEADER_SIZE].copy_from_slice(&bytes);
        full[HEADER_SIZE..].copy_from_slice(&h.sha256_of_payload);
        let h2 = Header::parse(&full).expect("parse");
        assert_eq!(h2, h);
    }

    #[test]
    fn header_rejects_wrong_magic() {
        let mut buf = [0u8; FULL_HEADER_SIZE];
        buf[0..4].copy_from_slice(b"WHAT");
        assert!(Header::parse(&buf).is_none());
    }

    /// Short buffers must be rejected because of their length alone.
    #[test]
    fn header_rejects_short_buffer() {
        let buf = [0u8; HEADER_SIZE];
        assert!(Header::parse(&buf).is_none());

        // The all-zero buffer above also fails the magic check, so on its own
        // it cannot prove the length guard works. Repeat with a valid magic so
        // that length is the only possible reason for rejection.
        let mut header_only = [0u8; HEADER_SIZE];
        header_only[..4].copy_from_slice(&MAGIC);
        assert!(Header::parse(&header_only).is_none());

        let mut one_short = [0u8; FULL_HEADER_SIZE - 1];
        one_short[..4].copy_from_slice(&MAGIC);
        assert!(Header::parse(&one_short).is_none());

        assert!(Header::parse(&[]).is_none());

        // Positive control: the same bytes at full length are accepted.
        let mut exact = [0u8; FULL_HEADER_SIZE];
        exact[..4].copy_from_slice(&MAGIC);
        assert!(Header::parse(&exact).is_some());
    }

    #[test]
    fn version_byte_round_trip() {
        assert_eq!(Version::from_byte(1), Some(Version::V1));
        assert_eq!(Version::from_byte(2), None);
        assert_eq!(Version::from_byte(0), None);
    }

    #[test]
    fn engine_kind_byte_round_trip() {
        for k in [
            EngineKind::Pinyin,
            EngineKind::Wubi,
            EngineKind::NihongoJukugo,
            EngineKind::NihongoKanji,
            EngineKind::Other,
        ] {
            assert_eq!(EngineKind::from_byte(k as u8), Some(k));
        }
        assert_eq!(EngineKind::from_byte(99), None);
    }

    #[test]
    fn entry_round_trip_preserves_fields() {
        let e = EntryRecord {
            word_offset: 0x12_3456,
            code_offset: 0xab_cdef,
            log_prior: -42,
            match_type: 1,
            flags: EntryFlags::BLACKLIST | EntryFlags::USER_ADDED,
            raw_freq: 0xdead_beef,
            embedding_offset: 0,
        };
        let bytes = e.to_bytes();
        assert_eq!(bytes.len(), ENTRY_SIZE);
        let e2 = EntryRecord::parse(&bytes);
        assert_eq!(e2, e);
    }

    /// Offsets are stored in 24 bits: the largest 24-bit value round-trips and
    /// anything above bit 23 is dropped.
    #[test]
    fn entry_u24_offsets_truncate_at_24_bits() {
        let e = EntryRecord {
            word_offset: 0xFF_FFFF,
            code_offset: 0,
            log_prior: 0,
            match_type: 0,
            flags: 0,
            raw_freq: 0,
            embedding_offset: 0,
        };
        let bytes = e.to_bytes();
        let e2 = EntryRecord::parse(&bytes);
        assert_eq!(e2.word_offset, 0xFF_FFFF);

        // Exercise the truncation itself: set bits above bit 23 and check that
        // only the low 24 bits come back.
        let wide = EntryRecord {
            word_offset: 0x0100_0001,
            code_offset: 0xFF00_0002,
            ..e
        };
        let narrowed = EntryRecord::parse(&wide.to_bytes());
        assert_eq!(narrowed.word_offset, 0x00_0001);
        assert_eq!(narrowed.code_offset, 0x00_0002);
    }

    #[test]
    fn match_type_round_trip() {
        for mt in [
            inputx_scoring::MatchType::Exact,
            inputx_scoring::MatchType::Prefix(800),
            inputx_scoring::MatchType::Fuzzy(300),
            inputx_scoring::MatchType::Composed { bigram_links: 2 },
        ] {
            let b = encode_match_type(mt);
            let back = decode_match_type(b);
            // Payloads are not encoded, so only the variant can be compared.
            assert_eq!(
                core::mem::discriminant(&back),
                core::mem::discriminant(&mt),
                "variant {mt:?} → byte {b} → {back:?} (variant must match)"
            );
        }
        // Unknown tags fall back to `Exact`.
        assert_eq!(decode_match_type(99), inputx_scoring::MatchType::Exact);
    }
}