datacortex_core/model/json_model.rs
1//! JsonModel -- JSON structure-aware context model.
2//!
3//! Phase 4: Tracks JSON parsing state and provides structure-aware predictions.
4//! Gives the mixer specialized weight sets for different JSON contexts:
5//! - Inside a key vs inside a value
6//! - String vs number vs boolean vs null
7//! - Array index position
8//! - After colon vs after comma
9//!
10//! This model provides both a prediction and a JSON state byte that other
11//! models can use as additional mixer context.
12
13use crate::state::context_map::ContextMap;
14use crate::state::state_map::StateMap;
15use crate::state::state_table::StateTable;
16
/// Coarse JSON tokenizer state used as compression context.
///
/// The discriminants are explicit because the value is cast to an integer
/// when it is folded into context hashes and into the mixer-selector byte;
/// every variant fits in three bits.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
#[repr(u8)]
enum JsonState {
    /// Not inside any container (start of input, or all containers closed).
    TopLevel = 0,
    /// In an object at a position where a key or `}` is expected.
    ObjectKey = 1,
    /// Past a `:`; a value is expected.
    ObjectValue = 2,
    /// In an array at a position where a value or `]` is expected.
    ArrayValue = 3,
    /// Between the quotes of a string (either a key or a value).
    String = 4,
    /// Within a numeric literal.
    Number = 5,
    /// Within a bare keyword (`true`, `false`, `null`).
    Keyword = 6,
}
36
37/// JSON structure-aware context model.
38pub struct JsonModel {
39 /// Context map for JSON-state-aware prediction.
40 cmap: ContextMap,
41 /// State map.
42 smap: StateMap,
43 /// Current JSON parse state.
44 state: JsonState,
45 /// Whether we're in a key string (vs value string).
46 in_key: bool,
47 /// Nesting depth (quantized).
48 depth: u8,
49 /// Hash of the current key (for key->value correlation).
50 key_hash: u32,
51 /// Previous byte for state tracking.
52 prev_byte: u8,
53 /// Whether previous byte was backslash (for escape handling).
54 escaped: bool,
55 /// Last state for update.
56 last_state: u8,
57 /// Last hash for update.
58 last_hash: u32,
59}
60
61impl JsonModel {
62 /// Create a JSON model with default 8MB ContextMap.
63 pub fn new() -> Self {
64 Self::with_size(1 << 23) // 8MB
65 }
66
67 /// Create a JSON model with a custom ContextMap size (in bytes).
68 pub fn with_size(cmap_size: usize) -> Self {
69 JsonModel {
70 cmap: ContextMap::new(cmap_size),
71 smap: StateMap::new(),
72 state: JsonState::TopLevel,
73 in_key: false,
74 depth: 0,
75 key_hash: 0,
76 prev_byte: 0,
77 escaped: false,
78 last_state: 0,
79 last_hash: 0,
80 }
81 }
82
83 /// Predict based on JSON structure context.
84 /// `c0`: partial byte (1-255).
85 /// `bpos`: bit position (0-7).
86 /// `c1`: last completed byte.
87 #[inline]
88 pub fn predict(&mut self, c0: u32, bpos: u8, c1: u8) -> u32 {
89 if bpos == 0 {
90 self.update_json_state(c1);
91 }
92
93 // Context hash: json_state(3b) + in_key(1b) + depth_q(2b) + c0(8b)
94 // For string contexts, also mix in key_hash
95 let mut h: u32 = 0xCAFEBABE;
96 h = h.wrapping_mul(0x01000193) ^ (self.state as u32);
97 h = h.wrapping_mul(0x01000193) ^ (self.in_key as u32);
98 h = h.wrapping_mul(0x01000193) ^ (self.depth.min(3) as u32);
99 h = h.wrapping_mul(0x01000193) ^ (c0 & 0xFF);
100
101 // For values, mix in key hash so values associated with the same key
102 // share a context (e.g., all "name" values cluster together)
103 if self.state == JsonState::ObjectValue || self.state == JsonState::String {
104 h = h.wrapping_mul(0x01000193) ^ self.key_hash;
105 }
106
107 let state = self.cmap.get(h);
108 self.last_state = state;
109 self.last_hash = h;
110 self.smap.predict(state)
111 }
112
113 /// Update after observing bit.
114 #[inline]
115 pub fn update(&mut self, bit: u8) {
116 self.smap.update(self.last_state, bit);
117 let new_state = StateTable::next(self.last_state, bit);
118 self.cmap.set(self.last_hash, new_state);
119 }
120
121 /// Return the current JSON state as a byte for mixer context.
122 /// Returns 0-15 encoding the JSON parser state.
123 #[inline]
124 pub fn json_state_byte(&self) -> u8 {
125 let state_bits = self.state as u8 & 0x7;
126 let key_bit = if self.in_key { 8 } else { 0 };
127 state_bits | key_bit
128 }
129
130 /// Update JSON parse state based on the last completed byte.
131 fn update_json_state(&mut self, c1: u8) {
132 // Handle string escaping
133 if self.state == JsonState::String {
134 if self.escaped {
135 self.escaped = false;
136 // Hash escaped char into key_hash if in key
137 if self.in_key {
138 self.key_hash = self.key_hash.wrapping_mul(0x01000193) ^ c1 as u32;
139 }
140 self.prev_byte = c1;
141 return;
142 }
143 if c1 == b'\\' {
144 self.escaped = true;
145 self.prev_byte = c1;
146 return;
147 }
148 if c1 == b'"' {
149 // End of string
150 if self.in_key {
151 // Key finished — next should be colon then value
152 self.state = JsonState::ObjectKey; // waiting for colon
153 } else {
154 // Value string finished
155 self.state = JsonState::ObjectValue; // will transition on comma/brace
156 }
157 self.prev_byte = c1;
158 return;
159 }
160 // Regular string character — hash into key hash if in key
161 if self.in_key {
162 self.key_hash = self.key_hash.wrapping_mul(0x01000193) ^ c1 as u32;
163 }
164 self.prev_byte = c1;
165 return;
166 }
167
168 // Not in string — track structural characters
169 match c1 {
170 b'{' => {
171 self.state = JsonState::ObjectKey;
172 self.depth = self.depth.saturating_add(1);
173 }
174 b'[' => {
175 self.state = JsonState::ArrayValue;
176 self.depth = self.depth.saturating_add(1);
177 }
178 b'}' | b']' => {
179 self.depth = self.depth.saturating_sub(1);
180 // Pop back to parent context
181 self.state = if self.depth > 0 {
182 JsonState::ObjectValue // could be either, but ObjectValue is safe
183 } else {
184 JsonState::TopLevel
185 };
186 }
187 b'"' => {
188 // Starting a string
189 self.state = JsonState::String;
190 // Determine if this is a key or value
191 // Key if: after '{', after ',', or if prev non-ws was '{' or ','
192 self.in_key = matches!(self.prev_significant_context(), b'{' | b',');
193 if self.in_key {
194 self.key_hash = 0; // reset for new key
195 }
196 }
197 b':' => {
198 self.state = JsonState::ObjectValue;
199 }
200 b',' => {
201 // After comma, context depends on container
202 // Could be in object (next key) or array (next value)
203 // We'll set ObjectKey and let the quote detection fix it
204 self.state = JsonState::ObjectKey;
205 }
206 b'0'..=b'9' | b'-' => {
207 if self.state != JsonState::Number {
208 self.state = JsonState::Number;
209 }
210 }
211 b't' | b'f' | b'n' => {
212 if self.state != JsonState::Keyword && self.state != JsonState::String {
213 self.state = JsonState::Keyword;
214 }
215 }
216 _ => {
217 // Whitespace or other — don't change state
218 }
219 }
220
221 self.prev_byte = c1;
222 }
223
224 /// Get the previous significant (non-whitespace) byte context.
225 /// Simplified: just return prev_byte since we don't store history.
226 #[inline]
227 fn prev_significant_context(&self) -> u8 {
228 // Skip whitespace in prev_byte
229 if self.prev_byte.is_ascii_whitespace() {
230 // Can't look further back, assume comma context
231 b','
232 } else {
233 self.prev_byte
234 }
235 }
236}
237
238impl Default for JsonModel {
239 fn default() -> Self {
240 Self::new()
241 }
242}
243
#[cfg(test)]
mod tests {
    use super::*;

    /// A model that has seen nothing yet predicts the midpoint value.
    #[test]
    fn initial_prediction_balanced() {
        let mut model = JsonModel::new();
        assert_eq!(model.predict(1, 0, 0), 2048);
    }

    /// Every prediction made while coding a small document stays in range.
    #[test]
    fn predictions_in_range() {
        let sample: &[u8] = b"{\"name\":\"Alice\",\"age\":30}";
        let mut model = JsonModel::new();
        for &byte in sample {
            for bpos in 0..8u8 {
                // The byte is handed over as `c1` only on the first bit.
                let c1 = if bpos == 0 { byte } else { 0 };
                let p = model.predict(1, bpos, c1);
                assert!((1..=4095).contains(&p));
                model.update((byte >> (7 - bpos)) & 1);
            }
        }
    }

    /// Seeing an opening brace must move the tracker off the top level.
    #[test]
    fn json_state_changes() {
        let mut model = JsonModel::new();
        let _ = model.predict(1, 0, b'{');
        assert_ne!(model.state, JsonState::TopLevel);
    }
}