use core::convert::TryInto;
/// Four-byte signature that `Header::parse` requires at the start of its input.
pub const MAGIC: [u8; 4] = *b"IDFv";
/// Length in bytes of the array returned by `Header::to_bytes`.
pub const HEADER_SIZE: usize = 64;
/// Length in bytes of one record as produced by `EntryRecord::to_bytes`.
pub const ENTRY_SIZE: usize = 16;
/// Format revisions this module can name.
///
/// Each variant's discriminant is the byte value that
/// [`Version::from_byte`] maps back to that variant.
#[repr(u8)]
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum Version {
    /// Revision 1, represented by the byte `1`.
    V1 = 1,
}

impl Version {
    /// Converts a raw byte into a known [`Version`].
    ///
    /// Returns `None` for any byte that is not the discriminant of a variant.
    pub fn from_byte(b: u8) -> Option<Self> {
        if b == Self::V1 as u8 {
            Some(Self::V1)
        } else {
            None
        }
    }
}
/// Engine identifiers, each tied to a fixed byte value.
///
/// The discriminant of a variant is the byte that
/// [`EngineKind::from_byte`] converts back into that variant.
#[repr(u8)]
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum EngineKind {
    /// Byte value `0`.
    Pinyin = 0,
    /// Byte value `1`.
    Wubi = 1,
    /// Byte value `2`.
    NihongoJukugo = 2,
    /// Byte value `3`.
    NihongoKanji = 3,
    /// Byte value `4`.
    Other = 4,
}

impl EngineKind {
    /// Converts a raw byte into the variant carrying that discriminant.
    ///
    /// Returns `None` when no variant uses the given byte.
    pub fn from_byte(b: u8) -> Option<Self> {
        // Search by discriminant so the mapping always mirrors the enum
        // declaration rather than a second, hand-maintained table of numbers.
        let known = [
            Self::Pinyin,
            Self::Wubi,
            Self::NihongoJukugo,
            Self::NihongoKanji,
            Self::Other,
        ];
        known.iter().copied().find(|kind| *kind as u8 == b)
    }
}
/// Bit-packed per-entry flags held in a single byte.
///
/// The low three bits are independent boolean flags; the top three bits
/// hold a small integer tag (see [`EntryFlags::engine_tag`]).
#[derive(Copy, Clone, Debug, PartialEq, Eq, Default)]
pub struct EntryFlags(pub u8);

impl EntryFlags {
    /// Bit 0.
    pub const BLACKLIST: u8 = 1 << 0;
    /// Bit 1.
    pub const CURATED_OVERRIDE: u8 = 1 << 1;
    /// Bit 2.
    pub const USER_ADDED: u8 = 1 << 2;
    /// Bits 5..=7, the field that stores the engine tag.
    pub const ENGINE_TAG_MASK: u8 = 0b1110_0000;
    /// Number of bits the engine tag is shifted left inside the byte.
    pub const ENGINE_TAG_SHIFT: u8 = 5;

    /// True when any bit of `bit` is set in the flag byte.
    fn has(self, bit: u8) -> bool {
        (self.0 & bit) != 0
    }

    /// Reports whether the [`Self::BLACKLIST`] bit is set.
    pub fn is_blacklisted(self) -> bool {
        self.has(Self::BLACKLIST)
    }

    /// Reports whether the [`Self::CURATED_OVERRIDE`] bit is set.
    pub fn is_curated_override(self) -> bool {
        self.has(Self::CURATED_OVERRIDE)
    }

    /// Reports whether the [`Self::USER_ADDED`] bit is set.
    pub fn is_user_added(self) -> bool {
        self.has(Self::USER_ADDED)
    }

    /// Extracts the three-bit engine tag as a value in `0..=7`.
    pub fn engine_tag(self) -> u8 {
        let field = self.0 & Self::ENGINE_TAG_MASK;
        field >> Self::ENGINE_TAG_SHIFT
    }

    /// Returns a copy with the engine tag replaced by the low three bits of
    /// `tag`; all other bits are carried over unchanged.
    pub fn with_engine_tag(self, tag: u8) -> Self {
        let kept = self.0 & !Self::ENGINE_TAG_MASK;
        // Only the low three bits of `tag` fit in the field.
        let field = (tag & 0b111) << Self::ENGINE_TAG_SHIFT;
        Self(kept | field)
    }
}
/// In-memory form of the file header.
///
/// [`Header::to_bytes`] encodes every field except `sha256_of_payload` into a
/// `HEADER_SIZE`-byte array (little-endian integers, unused tail left zero).
/// [`Header::parse`] reads those fields back and additionally takes the
/// digest from the 32 bytes that directly follow the fixed header.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub struct Header {
    /// Bytes 0..4 of the encoded header.
    pub magic: [u8; 4],
    /// Byte 4. Kept as a raw byte; `parse` does not validate it.
    pub format_version: u8,
    /// Byte 5. Kept as a raw byte; `parse` does not validate it.
    pub engine_kind: u8,
    /// Bytes 6..8.
    pub flags: u16,
    /// Bytes 8..12.
    pub entry_count: u32,
    /// Bytes 12..16.
    pub string_pool_offset: u32,
    /// Bytes 16..20.
    pub string_pool_size: u32,
    /// Bytes 20..24.
    pub entry_table_offset: u32,
    /// Bytes 24..28.
    pub fst_code_index_offset: u32,
    /// Bytes 28..32.
    pub fst_code_index_size: u32,
    /// Bytes 32..36.
    pub fst_word_index_offset: u32,
    /// Bytes 36..40.
    pub fst_word_index_size: u32,
    /// Bytes 40..44.
    pub bigram_offset: u32,
    /// Bytes 44..48.
    pub bigram_size: u32,
    /// Bytes 48..52.
    pub embedding_offset: u32,
    /// Bytes 52..54.
    pub embedding_dim: u16,
    /// Byte 54.
    pub embedding_dtype: u8,
    /// Byte 55.
    pub reserved: u8,
    /// The 32 bytes located right after the fixed header in the buffer handed
    /// to `parse`. `to_bytes` does not emit this field.
    pub sha256_of_payload: [u8; 32],
}

impl Header {
    /// Encodes the fixed-size part of the header.
    ///
    /// All multi-byte integers are little-endian. Bytes 56 and above stay
    /// zero. `sha256_of_payload` is not included; callers that need a buffer
    /// `parse` will accept must append those 32 bytes themselves.
    pub fn to_bytes(&self) -> [u8; HEADER_SIZE] {
        let mut out = [0u8; HEADER_SIZE];

        out[0..4].copy_from_slice(&self.magic);
        out[4] = self.format_version;
        out[5] = self.engine_kind;
        out[6..8].copy_from_slice(&self.flags.to_le_bytes());

        // The eleven 32-bit fields sit back to back in bytes 8..52, in
        // declaration order.
        let words = [
            self.entry_count,
            self.string_pool_offset,
            self.string_pool_size,
            self.entry_table_offset,
            self.fst_code_index_offset,
            self.fst_code_index_size,
            self.fst_word_index_offset,
            self.fst_word_index_size,
            self.bigram_offset,
            self.bigram_size,
            self.embedding_offset,
        ];
        for (slot, word) in out[8..52].chunks_exact_mut(4).zip(words.iter()) {
            slot.copy_from_slice(&word.to_le_bytes());
        }

        out[52..54].copy_from_slice(&self.embedding_dim.to_le_bytes());
        out[54] = self.embedding_dtype;
        out[55] = self.reserved;
        out
    }

    /// Decodes a header from the start of `buf`.
    ///
    /// Returns `None` when `buf` is shorter than `HEADER_SIZE + 32` bytes or
    /// when its first four bytes differ from `MAGIC`. No other field is
    /// validated. Bytes past the digest are ignored.
    pub fn parse(buf: &[u8]) -> Option<Self> {
        // Taking the digest slice first doubles as the length check: it only
        // exists when the fixed header and the trailing 32 bytes are present.
        let digest = buf.get(HEADER_SIZE..HEADER_SIZE + 32)?;
        if !buf.starts_with(&MAGIC) {
            return None;
        }

        let u16_at = |at: usize| -> Option<u16> {
            Some(u16::from_le_bytes(buf[at..at + 2].try_into().ok()?))
        };
        let u32_at = |at: usize| -> Option<u32> {
            Some(u32::from_le_bytes(buf[at..at + 4].try_into().ok()?))
        };

        Some(Self {
            magic: MAGIC,
            format_version: buf[4],
            engine_kind: buf[5],
            flags: u16_at(6)?,
            entry_count: u32_at(8)?,
            string_pool_offset: u32_at(12)?,
            string_pool_size: u32_at(16)?,
            entry_table_offset: u32_at(20)?,
            fst_code_index_offset: u32_at(24)?,
            fst_code_index_size: u32_at(28)?,
            fst_word_index_offset: u32_at(32)?,
            fst_word_index_size: u32_at(36)?,
            bigram_offset: u32_at(40)?,
            bigram_size: u32_at(44)?,
            embedding_offset: u32_at(48)?,
            embedding_dim: u16_at(52)?,
            embedding_dtype: buf[54],
            reserved: buf[55],
            sha256_of_payload: digest.try_into().ok()?,
        })
    }
}
/// Length in bytes of the digest held in `Header::sha256_of_payload`.
pub const SHA256_SIZE: usize = 32;
/// `HEADER_SIZE` plus `SHA256_SIZE`; equal to the shortest buffer that
/// `Header::parse` accepts (that function spells the same sum as
/// `HEADER_SIZE + 32`).
pub const FULL_HEADER_SIZE: usize = HEADER_SIZE + SHA256_SIZE;
/// One fixed-size entry record.
///
/// Encoded layout (`ENTRY_SIZE` bytes, little-endian):
///
/// | bytes   | field                          |
/// |---------|--------------------------------|
/// | 0..3    | low 24 bits of `word_offset`   |
/// | 3..6    | low 24 bits of `code_offset`   |
/// | 6..8    | `log_prior`                    |
/// | 8       | `match_type`                   |
/// | 9       | `flags`                        |
/// | 10..14  | `raw_freq`                     |
/// | 14..16  | always written as zero         |
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub struct EntryRecord {
    /// Only the low 24 bits survive encoding.
    pub word_offset: u32,
    /// Only the low 24 bits survive encoding.
    pub code_offset: u32,
    /// Signed 16-bit value stored in bytes 6..8.
    pub log_prior: i16,
    /// Raw byte stored at offset 8.
    pub match_type: u8,
    /// Raw byte stored at offset 9.
    pub flags: u8,
    /// Stored in bytes 10..14.
    pub raw_freq: u32,
    /// Not serialized: `to_bytes` ignores it and `parse` always yields `0`.
    // NOTE(review): unclear from this file whether dropping this field is
    // intentional — confirm against the format specification.
    pub embedding_offset: u32,
}

impl EntryRecord {
    /// Encodes the record into its `ENTRY_SIZE`-byte form.
    ///
    /// The top byte of `word_offset` and `code_offset` is silently discarded,
    /// and `embedding_offset` is not written at all.
    pub fn to_bytes(&self) -> [u8; ENTRY_SIZE] {
        // Destructure each integer into its little-endian bytes; the `_`
        // positions are the high bytes that do not fit in a 24-bit slot.
        let [w0, w1, w2, _] = self.word_offset.to_le_bytes();
        let [c0, c1, c2, _] = self.code_offset.to_le_bytes();
        let [p0, p1] = self.log_prior.to_le_bytes();
        let [f0, f1, f2, f3] = self.raw_freq.to_le_bytes();
        [
            w0, w1, w2, // word_offset (u24)
            c0, c1, c2, // code_offset (u24)
            p0, p1, // log_prior
            self.match_type,
            self.flags,
            f0, f1, f2, f3, // raw_freq
            0, 0, // unused tail
        ]
    }

    /// Decodes a record from its `ENTRY_SIZE`-byte form.
    ///
    /// Bytes 14 and 15 are ignored and `embedding_offset` is set to `0`.
    pub fn parse(buf: &[u8; ENTRY_SIZE]) -> Self {
        // Widen a three-byte little-endian value to `u32` with a zero top byte.
        let u24_at = |at: usize| u32::from_le_bytes([buf[at], buf[at + 1], buf[at + 2], 0]);
        Self {
            word_offset: u24_at(0),
            code_offset: u24_at(3),
            log_prior: i16::from_le_bytes([buf[6], buf[7]]),
            match_type: buf[8],
            flags: buf[9],
            raw_freq: u32::from_le_bytes([buf[10], buf[11], buf[12], buf[13]]),
            embedding_offset: 0,
        }
    }
}
/// Reduces a `MatchType` to a one-byte tag identifying its variant.
///
/// Any payload carried by the variant is discarded:
/// `Exact` → 0, `Prefix` → 1, `Fuzzy` → 2, `Composed` → 3.
pub fn encode_match_type(mt: inputx_scoring::MatchType) -> u8 {
    type Kind = inputx_scoring::MatchType;
    match mt {
        Kind::Exact => 0,
        Kind::Prefix(_) => 1,
        Kind::Fuzzy(_) => 2,
        Kind::Composed { .. } => 3,
    }
}
/// Rebuilds a `MatchType` from the one-byte tag written by
/// `encode_match_type`.
///
/// Only the variant is recovered; payloads are filled with zero
/// (`Prefix(0)`, `Fuzzy(0)`, `Composed { bigram_links: 0 }`). Tags outside
/// `1..=3` — including `0` and every unassigned value — decode as `Exact`.
pub fn decode_match_type(b: u8) -> inputx_scoring::MatchType {
    type Kind = inputx_scoring::MatchType;
    match b {
        1 => Kind::Prefix(0),
        2 => Kind::Fuzzy(0),
        3 => Kind::Composed { bigram_links: 0 },
        // Tag 0 is `Exact`; unknown tags deliberately share that fallback.
        _ => Kind::Exact,
    }
}
/// Unit tests for the header and entry-record encodings.
#[cfg(test)]
mod tests {
    use super::*;

    /// Pins the size constants so an accidental edit is caught.
    #[test]
    fn header_size_constants_match_spec() {
        assert_eq!(HEADER_SIZE, 64);
        assert_eq!(SHA256_SIZE, 32);
        assert_eq!(FULL_HEADER_SIZE, 96);
        assert_eq!(ENTRY_SIZE, 16);
    }

    /// `to_bytes` followed by the digest must parse back to the same header.
    #[test]
    fn header_round_trip_preserves_all_fields() {
        let h = Header {
            magic: MAGIC,
            format_version: 1,
            engine_kind: 2,
            flags: 0x0007,
            entry_count: 237_842,
            string_pool_offset: 96,
            string_pool_size: 2_097_152,
            entry_table_offset: 2_097_248,
            fst_code_index_offset: 5_900_000,
            fst_code_index_size: 1_048_576,
            fst_word_index_offset: 6_948_576,
            fst_word_index_size: 524_288,
            bigram_offset: 0,
            bigram_size: 0,
            embedding_offset: 0,
            embedding_dim: 0,
            embedding_dtype: 0,
            reserved: 0,
            sha256_of_payload: [0xab; 32],
        };
        let bytes = h.to_bytes();
        // `to_bytes` omits the digest, so append it by hand before parsing.
        let mut full = [0u8; FULL_HEADER_SIZE];
        full[..HEADER_SIZE].copy_from_slice(&bytes);
        full[HEADER_SIZE..].copy_from_slice(&h.sha256_of_payload);
        let h2 = Header::parse(&full).expect("parse");
        assert_eq!(h2, h);
    }

    #[test]
    fn header_rejects_wrong_magic() {
        let mut buf = [0u8; FULL_HEADER_SIZE];
        buf[0..4].copy_from_slice(b"WHAT");
        assert!(Header::parse(&buf).is_none());
    }

    /// A buffer holding only the fixed header (no digest) is too short.
    #[test]
    fn header_rejects_short_buffer() {
        let buf = [0u8; HEADER_SIZE];
        assert!(Header::parse(&buf).is_none());
    }

    #[test]
    fn version_byte_round_trip() {
        assert_eq!(Version::from_byte(1), Some(Version::V1));
        assert_eq!(Version::from_byte(2), None);
        assert_eq!(Version::from_byte(0), None);
    }

    #[test]
    fn engine_kind_byte_round_trip() {
        for k in [
            EngineKind::Pinyin,
            EngineKind::Wubi,
            EngineKind::NihongoJukugo,
            EngineKind::NihongoKanji,
            EngineKind::Other,
        ] {
            assert_eq!(EngineKind::from_byte(k as u8), Some(k));
        }
        assert_eq!(EngineKind::from_byte(99), None);
    }

    #[test]
    fn entry_round_trip_preserves_fields() {
        let e = EntryRecord {
            word_offset: 0x12_3456,
            code_offset: 0xab_cdef,
            log_prior: -42,
            match_type: 1,
            flags: EntryFlags::BLACKLIST | EntryFlags::USER_ADDED,
            raw_freq: 0xdead_beef,
            embedding_offset: 0,
        };
        let bytes = e.to_bytes();
        assert_eq!(bytes.len(), ENTRY_SIZE);
        let e2 = EntryRecord::parse(&bytes);
        assert_eq!(e2, e);
    }

    /// Offsets are stored in three bytes: the largest 24-bit value survives,
    /// and anything wider loses its top byte without touching other fields.
    #[test]
    fn entry_u24_offsets_truncate_at_24_bits() {
        let e = EntryRecord {
            word_offset: 0xFF_FFFF,
            code_offset: 0,
            log_prior: 0,
            match_type: 0,
            flags: 0,
            raw_freq: 0,
            embedding_offset: 0,
        };
        let bytes = e.to_bytes();
        let e2 = EntryRecord::parse(&bytes);
        assert_eq!(e2.word_offset, 0xFF_FFFF);

        // Values that need more than 24 bits actually exercise the truncation.
        let wide = EntryRecord {
            word_offset: 0x0123_4567,
            code_offset: 0xFFFF_FFFF,
            ..e
        };
        let back = EntryRecord::parse(&wide.to_bytes());
        assert_eq!(back.word_offset, 0x23_4567);
        assert_eq!(back.code_offset, 0xFF_FFFF);
        // The dropped high bytes must not spill into the neighbouring fields.
        assert_eq!(back.log_prior, 0);
        assert_eq!(back.match_type, 0);
        assert_eq!(back.flags, 0);
        assert_eq!(back.raw_freq, 0);
    }

    /// Encoding keeps only the variant, so compare discriminants, not values.
    #[test]
    fn match_type_round_trip() {
        for mt in [
            inputx_scoring::MatchType::Exact,
            inputx_scoring::MatchType::Prefix(800),
            inputx_scoring::MatchType::Fuzzy(300),
            inputx_scoring::MatchType::Composed { bigram_links: 2 },
        ] {
            let b = encode_match_type(mt);
            let back = decode_match_type(b);
            assert_eq!(
                core::mem::discriminant(&back),
                core::mem::discriminant(&mt),
                "variant {mt:?} → byte {b} → {back:?} (variant must match)"
            );
        }
        // Unknown tags fall back to `Exact`.
        assert_eq!(decode_match_type(99), inputx_scoring::MatchType::Exact);
    }
}