// kiri_engine/shared.rs
//! SharedDictionary: pre-parsed trie + raw data that can be shared across
//! multiple tokenizer instances via `Arc`, avoiding ~150 MB per-tokenizer
//! trie duplication.

use std::ops::Deref;
use std::sync::Arc;

use memmap2::Mmap;

use crate::dictionary::grammar::read_grammar;
use crate::dictionary::header::read_dictionary_header;
use crate::dictionary::lexicon::Lexicon;
use crate::dictionary::trie::DoubleArrayTrie;
use crate::types::{has_synonym_group_ids, Grammar, DICTIONARY_HEADER_SIZE};

/// Dictionary data that can be either an owned `Vec<u8>` (when mutation was
/// needed for connection inhibitions) or a memory-mapped file (zero-copy,
/// OS-managed pages).
pub enum DictData {
    /// Heap-owned dictionary bytes; used when the caller had to mutate the
    /// data (connection inhibitions) before sharing it.
    Owned(Vec<u8>),
    /// Memory-mapped dictionary file; pages are managed (and can be
    /// evicted) by the OS, so they add no heap pressure.
    Mapped(Mmap),
}

24impl Deref for DictData {
25    type Target = [u8];
26
27    fn deref(&self) -> &[u8] {
28        match self {
29            DictData::Owned(v) => v,
30            DictData::Mapped(m) => m,
31        }
32    }
33}
34
/// Shared dictionary state that can be cloned cheaply across tokenizers.
///
/// Holds the raw dictionary bytes (`Arc<DictData>`) and a pre-parsed
/// `DoubleArrayTrie` (`Arc<DoubleArrayTrie>`). Creating a new `Lexicon`
/// from this struct is a 16-byte `Arc::clone` + lightweight offset parsing,
/// not a 150 MB trie copy.
pub struct SharedDictionary {
    // Raw dictionary bytes (header + grammar + lexicon), shared via Arc.
    data: Arc<DictData>,
    // Trie parsed once at construction; per-tokenizer cost is a refcount bump.
    trie: Arc<DoubleArrayTrie>,
    // Byte count returned by `DoubleArrayTrie::from_bytes` alongside the
    // parsed trie — presumably the serialized trie's length; forwarded to
    // `Lexicon::from_shared`.
    trie_bytes: usize,
    // Whether the dictionary version carries synonym group IDs
    // (per `has_synonym_group_ids(header.version)`).
    has_synonyms: bool,
    // Byte offset where the lexicon section starts (header + grammar sizes).
    lexicon_offset: usize,
}

49impl SharedDictionary {
50    /// Build a `SharedDictionary` from owned (already inhibited) dictionary bytes.
51    ///
52    /// Callers must apply `inhibit_connection_in_data` **before** passing `data`.
53    pub fn new(data: Vec<u8>) -> Result<Self, String> {
54        Self::from_dict_data(DictData::Owned(data))
55    }
56
57    /// Build a `SharedDictionary` from a memory-mapped file.
58    ///
59    /// Use this when no connection inhibitions are needed — the OS manages
60    /// the pages and they can be shared / evicted without heap pressure.
61    pub fn from_mmap(mmap: Mmap) -> Result<Self, String> {
62        Self::from_dict_data(DictData::Mapped(mmap))
63    }
64
65    fn from_dict_data(dict_data: DictData) -> Result<Self, String> {
66        let header = read_dictionary_header(&dict_data, 0)?;
67        let has_synonyms = has_synonym_group_ids(header.version);
68
69        let (_, grammar_bytes) = read_grammar(&dict_data, DICTIONARY_HEADER_SIZE)?;
70        let lexicon_offset = DICTIONARY_HEADER_SIZE + grammar_bytes;
71
72        // Parse the trie once; all future tokenizers share it via Arc::clone.
73        let (trie, trie_bytes) = DoubleArrayTrie::from_bytes(&dict_data, lexicon_offset);
74
75        Ok(Self {
76            data: Arc::new(dict_data),
77            trie: Arc::new(trie),
78            trie_bytes,
79            has_synonyms,
80            lexicon_offset,
81        })
82    }
83
84    /// Create a `Lexicon` that shares the pre-built trie.
85    pub fn create_lexicon(&self) -> (Lexicon, usize) {
86        Lexicon::from_shared(
87            &self.data,
88            self.lexicon_offset,
89            Arc::clone(&self.trie),
90            self.trie_bytes,
91            self.has_synonyms,
92        )
93    }
94
95    /// Re-parse the grammar section from the shared data.
96    pub fn create_grammar(&self) -> Result<Grammar, String> {
97        let (grammar, _) = read_grammar(&self.data, DICTIONARY_HEADER_SIZE)?;
98        Ok(grammar)
99    }
100
101    /// Get a reference to the shared data `Arc`.
102    pub fn data(&self) -> &Arc<DictData> {
103        &self.data
104    }
105
106    /// Whether this dictionary version includes synonym group IDs.
107    pub fn has_synonyms(&self) -> bool {
108        self.has_synonyms
109    }
110}