Skip to main content

kiri_engine/dictionary/
lexicon.rs

1//! Lexicon: combines DoubleArrayTrie, WordIdTable, WordParameterList, and WordInfoList
2//! into a single dictionary lookup interface.
3
4use std::sync::Arc;
5
6use crate::dictionary::trie::DoubleArrayTrie;
7use crate::dictionary::word_id_table::WordIdTable;
8use crate::dictionary::word_info::WordInfoList;
9use crate::dictionary::word_params::WordParameterList;
10use crate::types::{WordInfo, WordParameter};
11
/// A single word match from a trie lookup.
///
/// Produced by `Lexicon::lookup`, which emits one `WordMatch` per trie hit
/// that resolves to at least one word ID.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct WordMatch {
    /// Word IDs the word-ID table returned for this trie hit.
    pub word_ids: Vec<i32>,
    /// Length reported by the trie for the matched prefix
    /// (presumably measured in bytes of the lookup key — confirm against the trie).
    pub length: usize,
}
18
/// Dictionary lookup facade bundling the trie with its word tables.
///
/// The query methods on this type take the dictionary bytes as a `data`
/// argument on every call and forward to the component that serves the request.
pub struct Lexicon {
    /// Prefix trie searched by `lookup`; wrapped in `Arc` so an already-built
    /// trie can be handed to `from_shared` instead of being parsed again.
    pub trie: Arc<DoubleArrayTrie>,
    /// Turns the value carried by a trie match into a list of word IDs.
    pub word_id_table: WordIdTable,
    /// Per-word left ID, right ID and cost; its `size()` is reported as the
    /// number of words in the lexicon.
    pub word_params: WordParameterList,
    /// Source of per-word `WordInfo` records. It is constructed from a start
    /// offset and a word count only, not from the bytes themselves.
    pub word_infos: WordInfoList,
}
25
26impl Lexicon {
27    /// Read a Lexicon from raw bytes at the given offset.
28    /// Returns the lexicon and total bytes consumed (excluding word_infos trailing data).
29    pub fn from_bytes(data: &[u8], offset: usize, has_synonym_gid: bool) -> (Self, usize) {
30        let start_offset = offset;
31        let mut pos = offset;
32
33        // 1. Trie
34        let (trie, trie_bytes) = DoubleArrayTrie::from_bytes(data, pos);
35        pos += trie_bytes;
36
37        // 2. WordIdTable
38        let word_id_table = WordIdTable::new(data, pos);
39        pos += word_id_table.storage_size();
40
41        // 3. WordParameterList
42        let word_params = WordParameterList::new(data, pos);
43        pos += word_params.storage_size();
44
45        // 4. WordInfoList (variable-length, just stores offset)
46        let word_infos = WordInfoList::new(pos, word_params.size(), has_synonym_gid);
47
48        let bytes_read = pos - start_offset;
49        (
50            Self {
51                trie: Arc::new(trie),
52                word_id_table,
53                word_params,
54                word_infos,
55            },
56            bytes_read,
57        )
58    }
59
60    /// Create a Lexicon that shares a pre-built trie via `Arc`.
61    /// Skips trie parsing — only reads WordIdTable, WordParameterList, WordInfoList.
62    pub fn from_shared(
63        data: &[u8],
64        offset: usize,
65        trie: Arc<DoubleArrayTrie>,
66        trie_bytes: usize,
67        has_synonym_gid: bool,
68    ) -> (Self, usize) {
69        let start_offset = offset;
70        let mut pos = offset + trie_bytes;
71
72        let word_id_table = WordIdTable::new(data, pos);
73        pos += word_id_table.storage_size();
74
75        let word_params = WordParameterList::new(data, pos);
76        pos += word_params.storage_size();
77
78        let word_infos = WordInfoList::new(pos, word_params.size(), has_synonym_gid);
79
80        let bytes_read = pos - start_offset;
81        (
82            Self {
83                trie,
84                word_id_table,
85                word_params,
86                word_infos,
87            },
88            bytes_read,
89        )
90    }
91
92    /// Look up all words that are prefixes of the input bytes starting at offset.
93    pub fn lookup(&self, data: &[u8], key: &[u8], offset: usize, limit: usize) -> Vec<WordMatch> {
94        let trie_matches = self.trie.common_prefix_search(key, offset, limit);
95        let mut results = Vec::new();
96
97        for m in &trie_matches {
98            let word_ids = self.word_id_table.get_word_ids(data, m.value, 0);
99            if !word_ids.is_empty() {
100                results.push(WordMatch {
101                    word_ids,
102                    length: m.length,
103                });
104            }
105        }
106
107        results
108    }
109
110    /// Get left connection ID for a word.
111    #[inline]
112    pub fn get_left_id(&self, data: &[u8], word_index: u32) -> i16 {
113        self.word_params.get_left_id(data, word_index)
114    }
115
116    /// Get right connection ID for a word.
117    #[inline]
118    pub fn get_right_id(&self, data: &[u8], word_index: u32) -> i16 {
119        self.word_params.get_right_id(data, word_index)
120    }
121
122    /// Get cost for a word.
123    #[inline]
124    pub fn get_cost(&self, data: &[u8], word_index: u32) -> i16 {
125        self.word_params.get_cost(data, word_index)
126    }
127
128    /// Get all connection parameters for a word.
129    pub fn get_parameters(&self, data: &[u8], word_index: u32) -> WordParameter {
130        self.word_params.get_parameters(data, word_index)
131    }
132
133    /// Get word info for a word index.
134    pub fn get_word_info(&self, data: &[u8], word_index: u32) -> WordInfo {
135        self.word_infos.get_word_info(data, word_index)
136    }
137
138    /// Number of words in this lexicon.
139    pub fn size(&self) -> u32 {
140        self.word_params.size()
141    }
142}