// datacortex_core/model/json_model.rs

//! JsonModel -- JSON structure-aware context model.
//!
//! Phase 4: Tracks JSON parsing state and provides structure-aware predictions.
//! Gives the mixer specialized weight sets for different JSON contexts:
//! - Inside a key vs inside a value
//! - String vs number vs boolean vs null
//! - Array index position
//! - After colon vs after comma
//!
//! This model provides both a prediction and a JSON state byte that other
//! models can use as additional mixer context.

use crate::state::context_map::ContextMap;
use crate::state::state_map::StateMap;
use crate::state::state_table::StateTable;

/// JSON parser states (simplified for compression context).
///
/// This is a coarse, byte-driven approximation of a JSON parser rather than a
/// validating one. The discriminants are explicit because the state is cast
/// to an integer when it is hashed into a context and when it is exported as
/// the mixer state byte; every value fits in three bits.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u8)]
enum JsonState {
    /// Outside any JSON structure or at top level. Initial state, and the
    /// state restored when a closing brace/bracket brings the depth to zero.
    TopLevel = 0,
    /// Inside an object, expecting key or closing brace. Also entered after
    /// any comma and after a key string closes (while waiting for the colon).
    ObjectKey = 1,
    /// After colon, expecting value. Also entered when a non-key string closes
    /// and after a closing brace/bracket that leaves the depth above zero.
    ObjectValue = 2,
    /// Inside an array, expecting value or closing bracket (entered on `[`).
    ArrayValue = 3,
    /// Inside a quoted string (key or value).
    String = 4,
    /// Inside a number literal (entered on a digit or `-` outside a string).
    Number = 5,
    /// Inside a keyword (true, false, null); entered on `t`, `f` or `n`
    /// outside a string.
    Keyword = 6,
}

37/// JSON structure-aware context model.
38pub struct JsonModel {
39    /// Context map for JSON-state-aware prediction.
40    cmap: ContextMap,
41    /// State map.
42    smap: StateMap,
43    /// Current JSON parse state.
44    state: JsonState,
45    /// Whether we're in a key string (vs value string).
46    in_key: bool,
47    /// Nesting depth (quantized).
48    depth: u8,
49    /// Hash of the current key (for key->value correlation).
50    key_hash: u32,
51    /// Previous byte for state tracking.
52    prev_byte: u8,
53    /// Whether previous byte was backslash (for escape handling).
54    escaped: bool,
55    /// Last state for update.
56    last_state: u8,
57    /// Last hash for update.
58    last_hash: u32,
59}
60
61impl JsonModel {
62    /// Create a JSON model with default 8MB ContextMap.
63    pub fn new() -> Self {
64        Self::with_size(1 << 23) // 8MB
65    }
66
67    /// Create a JSON model with a custom ContextMap size (in bytes).
68    pub fn with_size(cmap_size: usize) -> Self {
69        JsonModel {
70            cmap: ContextMap::new(cmap_size),
71            smap: StateMap::new(),
72            state: JsonState::TopLevel,
73            in_key: false,
74            depth: 0,
75            key_hash: 0,
76            prev_byte: 0,
77            escaped: false,
78            last_state: 0,
79            last_hash: 0,
80        }
81    }
82
83    /// Predict based on JSON structure context.
84    /// `c0`: partial byte (1-255).
85    /// `bpos`: bit position (0-7).
86    /// `c1`: last completed byte.
87    #[inline]
88    pub fn predict(&mut self, c0: u32, bpos: u8, c1: u8) -> u32 {
89        if bpos == 0 {
90            self.update_json_state(c1);
91        }
92
93        // Context hash: json_state(3b) + in_key(1b) + depth_q(2b) + c0(8b)
94        // For string contexts, also mix in key_hash
95        let mut h: u32 = 0xCAFEBABE;
96        h = h.wrapping_mul(0x01000193) ^ (self.state as u32);
97        h = h.wrapping_mul(0x01000193) ^ (self.in_key as u32);
98        h = h.wrapping_mul(0x01000193) ^ (self.depth.min(3) as u32);
99        h = h.wrapping_mul(0x01000193) ^ (c0 & 0xFF);
100
101        // For values, mix in key hash so values associated with the same key
102        // share a context (e.g., all "name" values cluster together)
103        if self.state == JsonState::ObjectValue || self.state == JsonState::String {
104            h = h.wrapping_mul(0x01000193) ^ self.key_hash;
105        }
106
107        let state = self.cmap.get(h);
108        self.last_state = state;
109        self.last_hash = h;
110        self.smap.predict(state)
111    }
112
113    /// Update after observing bit.
114    #[inline]
115    pub fn update(&mut self, bit: u8) {
116        self.smap.update(self.last_state, bit);
117        let new_state = StateTable::next(self.last_state, bit);
118        self.cmap.set(self.last_hash, new_state);
119    }
120
121    /// Return the current JSON state as a byte for mixer context.
122    /// Returns 0-15 encoding the JSON parser state.
123    #[inline]
124    pub fn json_state_byte(&self) -> u8 {
125        let state_bits = self.state as u8 & 0x7;
126        let key_bit = if self.in_key { 8 } else { 0 };
127        state_bits | key_bit
128    }
129
130    /// Update JSON parse state based on the last completed byte.
131    fn update_json_state(&mut self, c1: u8) {
132        // Handle string escaping
133        if self.state == JsonState::String {
134            if self.escaped {
135                self.escaped = false;
136                // Hash escaped char into key_hash if in key
137                if self.in_key {
138                    self.key_hash = self.key_hash.wrapping_mul(0x01000193) ^ c1 as u32;
139                }
140                self.prev_byte = c1;
141                return;
142            }
143            if c1 == b'\\' {
144                self.escaped = true;
145                self.prev_byte = c1;
146                return;
147            }
148            if c1 == b'"' {
149                // End of string
150                if self.in_key {
151                    // Key finished — next should be colon then value
152                    self.state = JsonState::ObjectKey; // waiting for colon
153                } else {
154                    // Value string finished
155                    self.state = JsonState::ObjectValue; // will transition on comma/brace
156                }
157                self.prev_byte = c1;
158                return;
159            }
160            // Regular string character — hash into key hash if in key
161            if self.in_key {
162                self.key_hash = self.key_hash.wrapping_mul(0x01000193) ^ c1 as u32;
163            }
164            self.prev_byte = c1;
165            return;
166        }
167
168        // Not in string — track structural characters
169        match c1 {
170            b'{' => {
171                self.state = JsonState::ObjectKey;
172                self.depth = self.depth.saturating_add(1);
173            }
174            b'[' => {
175                self.state = JsonState::ArrayValue;
176                self.depth = self.depth.saturating_add(1);
177            }
178            b'}' | b']' => {
179                self.depth = self.depth.saturating_sub(1);
180                // Pop back to parent context
181                self.state = if self.depth > 0 {
182                    JsonState::ObjectValue // could be either, but ObjectValue is safe
183                } else {
184                    JsonState::TopLevel
185                };
186            }
187            b'"' => {
188                // Starting a string
189                self.state = JsonState::String;
190                // Determine if this is a key or value
191                // Key if: after '{', after ',', or if prev non-ws was '{' or ','
192                self.in_key = matches!(self.prev_significant_context(), b'{' | b',');
193                if self.in_key {
194                    self.key_hash = 0; // reset for new key
195                }
196            }
197            b':' => {
198                self.state = JsonState::ObjectValue;
199            }
200            b',' => {
201                // After comma, context depends on container
202                // Could be in object (next key) or array (next value)
203                // We'll set ObjectKey and let the quote detection fix it
204                self.state = JsonState::ObjectKey;
205            }
206            b'0'..=b'9' | b'-' => {
207                if self.state != JsonState::Number {
208                    self.state = JsonState::Number;
209                }
210            }
211            b't' | b'f' | b'n' => {
212                if self.state != JsonState::Keyword && self.state != JsonState::String {
213                    self.state = JsonState::Keyword;
214                }
215            }
216            _ => {
217                // Whitespace or other — don't change state
218            }
219        }
220
221        self.prev_byte = c1;
222    }
223
224    /// Get the previous significant (non-whitespace) byte context.
225    /// Simplified: just return prev_byte since we don't store history.
226    #[inline]
227    fn prev_significant_context(&self) -> u8 {
228        // Skip whitespace in prev_byte
229        if self.prev_byte.is_ascii_whitespace() {
230            // Can't look further back, assume comma context
231            b','
232        } else {
233            self.prev_byte
234        }
235    }
236}
237
238impl Default for JsonModel {
239    fn default() -> Self {
240        Self::new()
241    }
242}
243
#[cfg(test)]
mod tests {
    use super::*;

    /// Code `data` through the model one bit at a time, following the
    /// `predict` contract: `c0` is the partial byte (starting at 1 and
    /// shifting in each coded bit) and `c1` is the previously completed byte.
    /// Every prediction is checked to lie in `1..=4095`.
    fn code_bytes(jm: &mut JsonModel, data: &[u8]) {
        let mut c1 = 0u8;
        for &byte in data {
            let mut c0 = 1u32;
            for bpos in 0..8u8 {
                let bit = (byte >> (7 - bpos)) & 1;
                let p = jm.predict(c0, bpos, c1);
                assert!((1..=4095).contains(&p));
                jm.update(bit);
                c0 = (c0 << 1) | u32::from(bit);
            }
            c1 = byte;
        }
    }

    /// Advance only the JSON tracker: one `predict` call at bit position 0
    /// per byte, with that byte as the last completed byte.
    fn track(jm: &mut JsonModel, data: &[u8]) {
        for &byte in data {
            jm.predict(1, 0, byte);
        }
    }

    #[test]
    fn initial_prediction_balanced() {
        let mut jm = JsonModel::new();
        let p = jm.predict(1, 0, 0);
        assert_eq!(p, 2048);
    }

    #[test]
    fn predictions_in_range() {
        let mut jm = JsonModel::new();
        code_bytes(&mut jm, b"{\"name\":\"Alice\",\"age\":30}");
    }

    #[test]
    fn json_state_changes() {
        let mut jm = JsonModel::new();
        // Feed opening brace
        jm.predict(1, 0, b'{');
        assert_ne!(jm.state, JsonState::TopLevel);
    }

    #[test]
    fn compact_object_classifies_keys_and_values() {
        let mut jm = JsonModel::new();

        track(&mut jm, b"{");
        assert_eq!(jm.json_state_byte(), JsonState::ObjectKey as u8);

        // A string directly after '{' is a key.
        track(&mut jm, b"\"");
        assert_eq!(jm.state, JsonState::String);
        assert!(jm.in_key);
        assert_eq!(jm.json_state_byte(), 12);

        // A string directly after ':' is a value.
        track(&mut jm, b"k\":\"");
        assert_eq!(jm.state, JsonState::String);
        assert!(!jm.in_key);
        assert_eq!(jm.json_state_byte(), 4);

        // Closing the value and then the object returns to the top level.
        track(&mut jm, b"v\"");
        assert_eq!(jm.state, JsonState::ObjectValue);
        track(&mut jm, b"}");
        assert_eq!(jm.state, JsonState::TopLevel);
    }
}
275}