Skip to main content

kiri_engine/
types.rs

//! Pure Rust types for Kiri's dictionary structures (no NAPI dependency).
2
/// Magic version numbers identifying a dictionary's binary format
/// (matching the Sudachi binary format).
///
/// Each value is written as the unsigned 64-bit magic and reinterpreted as
/// `i64`; the user-dictionary magics have the top bit set, so they are
/// negative once cast.
pub mod dictionary_version {
    /// System dictionary, format version 1.
    pub const SYSTEM_DICT_VERSION_1: i64 = 0x7366_d3f1_8cdc_7f5d_u64 as i64;
    /// System dictionary, format version 2.
    pub const SYSTEM_DICT_VERSION_2: i64 = 0x7366_d3f1_8cdc_7f5e_u64 as i64;
    /// User dictionary, format version 1.
    pub const USER_DICT_VERSION_1: i64 = 0xa50f_3118_8cdc_7f5d_u64 as i64;
    /// User dictionary, format version 2.
    pub const USER_DICT_VERSION_2: i64 = 0xa50f_3118_8cdc_7f5e_u64 as i64;
    /// User dictionary, format version 3.
    pub const USER_DICT_VERSION_3: i64 = 0xa50f_3118_8cdc_7f5f_u64 as i64;
}
11
/// Size in bytes of the binary dictionary header (see `DictionaryHeader`).
pub const DICTIONARY_HEADER_SIZE: usize = 272;
14
/// Parsed form of the binary dictionary header (272 bytes on disk).
#[derive(Clone, Debug)]
pub struct DictionaryHeader {
    /// Format version magic; see the `dictionary_version` constants.
    pub version: i64,
    /// Creation timestamp as stored in the header.
    // NOTE(review): unit/epoch is not visible in this file — confirm against the header parser.
    pub create_time: i64,
    /// Free-form description text stored in the header.
    pub description: String,
}
22
/// A part-of-speech tag expressed as a 6-level hierarchy.
#[derive(Clone, Debug)]
pub struct POS {
    /// The six hierarchy levels, one string per level.
    pub tags: [String; 6],
}
28
/// Per-word connection parameters: the two connection IDs plus the word cost.
#[derive(Clone, Copy, Debug, Default)]
pub struct WordParameter {
    /// Left connection ID.
    pub left_id: i16,
    /// Right connection ID.
    pub right_id: i16,
    /// Cost assigned to the word itself.
    pub cost: i16,
}
36
/// Metadata describing the connection cost matrix (row-major Int16).
///
/// Only the dimensions and the location of the data are held here; the cost
/// bytes themselves stay in the backing buffer and are read through
/// `ConnectionCosts::get_cost`.
#[derive(Debug)]
pub struct ConnectionMatrix {
    /// Number of left IDs.
    pub left_size: usize,
    /// Number of right IDs.
    pub right_size: usize,
    /// Offset into the mmap where the matrix data starts.
    pub data_offset: usize,
}
45
46/// Bundled connection cost lookup: matrix metadata + backing data slice.
47/// Used by the lattice and tokenizer hot path to avoid passing two separate arguments.
48#[derive(Debug, Clone, Copy)]
49pub struct ConnectionCosts<'a> {
50    pub left_size: usize,
51    pub right_size: usize,
52    pub data_offset: usize,
53    pub data: &'a [u8],
54}
55
56impl<'a> ConnectionCosts<'a> {
57    /// Create from a `ConnectionMatrix` and its backing data.
58    #[inline]
59    pub fn new(matrix: &ConnectionMatrix, data: &'a [u8]) -> Self {
60        Self {
61            left_size: matrix.left_size,
62            right_size: matrix.right_size,
63            data_offset: matrix.data_offset,
64            data,
65        }
66    }
67
68    /// Look up connection cost between a left ID and right ID.
69    /// Returns INHIBITED_CONNECTION (0x7fff) if the index is out of bounds.
70    #[inline]
71    pub fn get_cost(&self, left_id: i16, right_id: i16) -> i16 {
72        let index = right_id as u16 as usize * self.left_size + left_id as u16 as usize;
73        let byte_offset = self.data_offset + index * 2;
74        match (self.data.get(byte_offset), self.data.get(byte_offset + 1)) {
75            (Some(&lo), Some(&hi)) => i16::from_le_bytes([lo, hi]),
76            _ => 0x7fff, // inhibited
77        }
78    }
79}
80
81/// Grammar: POS inventory + connection costs.
82#[derive(Debug)]
83pub struct Grammar {
84    pub pos_list: Vec<POS>,
85    pub connection: ConnectionMatrix,
86    pub bos_parameter: WordParameter,
87    pub eos_parameter: WordParameter,
88}
89
/// Metadata for a single word as read from the dictionary.
// NOTE(review): units and ID spaces of the numeric fields are not visible in
// this file — confirm against the lexicon reader before relying on them.
#[derive(Clone, Debug)]
pub struct WordInfo {
    /// Surface string of the word.
    pub surface: String,
    /// Length of the headword.
    pub headword_length: u16,
    /// Part-of-speech ID (presumably an index into `Grammar::pos_list` — verify).
    pub pos_id: i16,
    /// Normalized form of the word.
    pub normalized_form: String,
    /// Word ID of the dictionary form.
    pub dictionary_form_word_id: i32,
    /// Dictionary form of the word.
    pub dictionary_form: String,
    /// Reading form of the word.
    pub reading_form: String,
    /// Word IDs making up the A-unit split.
    pub a_unit_split: Vec<i32>,
    /// Word IDs making up the B-unit split.
    pub b_unit_split: Vec<i32>,
    /// Word IDs describing the word structure.
    pub word_structure: Vec<i32>,
    /// Synonym group IDs (see `has_synonym_group_ids`).
    pub synonym_gids: Vec<i32>,
}
105
// ---- Word ID packing utilities ----
//
// A packed word ID keeps the dictionary ID in the top 4 bits and the word
// index in the remaining 28 bits.

/// Number of bits the dictionary ID is shifted left inside a packed word ID.
const DICT_ID_SHIFT: u32 = 28;
/// Mask selecting the word-index bits (everything below the dictionary ID).
const WORD_ID_MASK: i32 = (1_i32 << DICT_ID_SHIFT) - 1;
110
111/// Extract the dictionary ID from a packed word ID.
112#[inline]
113pub fn get_dictionary_id(word_id: i32) -> i32 {
114    ((word_id as u32) >> DICT_ID_SHIFT) as i32 & 0xf
115}
116
117/// Extract the internal word index from a packed word ID.
118#[inline]
119pub fn get_word_index(word_id: i32) -> i32 {
120    word_id & WORD_ID_MASK
121}
122
123/// Pack a dictionary ID and word index into a single word ID.
124#[inline]
125pub fn pack_word_id(dict_id: i32, word_index: i32) -> i32 {
126    (((dict_id & 0xf) as u32) << DICT_ID_SHIFT) as i32 | (word_index & WORD_ID_MASK)
127}
128
129/// Check whether a version represents a system dictionary.
130pub fn is_system_dictionary(version: i64) -> bool {
131    version == dictionary_version::SYSTEM_DICT_VERSION_1
132        || version == dictionary_version::SYSTEM_DICT_VERSION_2
133}
134
135/// Check whether a version represents a user dictionary.
136pub fn is_user_dictionary(version: i64) -> bool {
137    version == dictionary_version::USER_DICT_VERSION_1
138        || version == dictionary_version::USER_DICT_VERSION_2
139        || version == dictionary_version::USER_DICT_VERSION_3
140}
141
142/// Check whether the version supports synonym group IDs.
143pub fn has_synonym_group_ids(version: i64) -> bool {
144    version == dictionary_version::SYSTEM_DICT_VERSION_2
145        || version == dictionary_version::USER_DICT_VERSION_2
146        || version == dictionary_version::USER_DICT_VERSION_3
147}
148
#[cfg(test)]
mod tests {
    use super::dictionary_version as dv;
    use super::*;

    /// Pack the pair, then check both components come back unchanged.
    fn assert_round_trip(dict_id: i32, word_index: i32) {
        let packed = pack_word_id(dict_id, word_index);
        assert_eq!(get_dictionary_id(packed), dict_id);
        assert_eq!(get_word_index(packed), word_index);
    }

    #[test]
    fn test_pack_unpack_word_id() {
        assert_round_trip(3, 12345);
    }

    #[test]
    fn test_pack_word_id_zero() {
        assert_round_trip(0, 0);
    }

    #[test]
    fn test_pack_word_id_max() {
        // Largest dictionary ID together with the largest word index.
        assert_round_trip(15, WORD_ID_MASK);
    }

    #[test]
    fn test_version_checks() {
        // System dictionary detection.
        assert!(is_system_dictionary(dv::SYSTEM_DICT_VERSION_1));
        assert!(is_system_dictionary(dv::SYSTEM_DICT_VERSION_2));
        assert!(!is_system_dictionary(dv::USER_DICT_VERSION_1));

        // User dictionary detection.
        assert!(is_user_dictionary(dv::USER_DICT_VERSION_1));
        assert!(is_user_dictionary(dv::USER_DICT_VERSION_2));
        assert!(is_user_dictionary(dv::USER_DICT_VERSION_3));
        assert!(!is_user_dictionary(dv::SYSTEM_DICT_VERSION_1));

        // Synonym group ID support.
        assert!(has_synonym_group_ids(dv::SYSTEM_DICT_VERSION_2));
        assert!(!has_synonym_group_ids(dv::SYSTEM_DICT_VERSION_1));
    }
}