// datacortex_core/format/json.rs
use super::transform::TransformResult;
use std::collections::HashMap;
// Token scheme for the transformed stream. A raw ESCAPE byte introduces a
// three-way escape sequence:
//   ESCAPE, n                      (n <= MAX_SHORT_INDEX) -> dictionary key n
//   ESCAPE, ESCAPE_EXTENDED, u16le                        -> dictionary key with a large index
//   ESCAPE, ESCAPE_LITERAL                                -> a literal 0x00 byte from the input
const ESCAPE: u8 = 0x00;
const ESCAPE_EXTENDED: u8 = 0xFD;
const ESCAPE_LITERAL: u8 = 0xFE;
const MAX_SHORT_INDEX: u8 = 0xFC;
/// One quoted JSON object key found in the input.
struct KeyOccurrence {
    // Byte offset of the opening quote.
    start: usize,
    // Byte offset one past the closing quote (== data.len() if unterminated).
    end: usize,
    // Raw bytes between the quotes, escape sequences kept verbatim.
    content: Vec<u8>,
}

/// Scans `data` for JSON string literals whose next non-whitespace byte is
/// a colon — i.e. object keys. Each hit records the span of the quoted
/// literal (quotes included) and its raw, still-escaped content.
fn find_keys(data: &[u8]) -> Vec<KeyOccurrence> {
    let mut found = Vec::new();
    let mut i = 0;

    while i < data.len() {
        if data[i] != b'"' {
            i += 1;
            continue;
        }

        // Opening quote: collect the string body, honoring backslash escapes
        // so an escaped quote does not terminate the literal early.
        let open = i;
        i += 1;
        let mut body = Vec::new();
        let mut pending_escape = false;
        while i < data.len() {
            let b = data[i];
            if pending_escape {
                body.push(b);
                pending_escape = false;
            } else if b == b'\\' {
                body.push(b);
                pending_escape = true;
            } else if b == b'"' {
                i += 1; // consume the closing quote
                break;
            } else {
                body.push(b);
            }
            i += 1;
        }
        let close = i;

        // A string is a key exactly when the next non-whitespace byte is ':'.
        let next = data[close..]
            .iter()
            .position(|b| !b.is_ascii_whitespace())
            .map(|off| close + off);
        if let Some(n) = next {
            if data[n] == b':' {
                found.push(KeyOccurrence {
                    start: open,
                    end: close,
                    content: body,
                });
            }
        }
    }

    found
}
77fn build_dictionary(keys: &[KeyOccurrence]) -> Vec<Vec<u8>> {
79 let mut freq: HashMap<Vec<u8>, usize> = HashMap::new();
80 for k in keys {
81 *freq.entry(k.content.clone()).or_default() += 1;
82 }
83
84 let mut entries: Vec<(Vec<u8>, usize)> =
85 freq.into_iter().filter(|(_, count)| *count > 1).collect();
86
87 entries.sort_by(|a, b| b.1.cmp(&a.1).then(a.0.cmp(&b.0)));
89
90 entries.into_iter().map(|(k, _)| k).collect()
91}
92
93pub fn preprocess(data: &[u8]) -> Option<TransformResult> {
95 let keys = find_keys(data);
96 if keys.is_empty() {
97 return None;
98 }
99
100 let dict = build_dictionary(&keys);
101 if dict.is_empty() {
102 return None;
103 }
104
105 let lookup: HashMap<&[u8], usize> = dict
106 .iter()
107 .enumerate()
108 .map(|(i, k)| (k.as_slice(), i))
109 .collect();
110
111 let mut output = Vec::with_capacity(data.len());
112 let mut last_end = 0;
113
114 for key in &keys {
115 if let Some(&idx) = lookup.get(key.content.as_slice()) {
116 escape_copy(&data[last_end..key.start], &mut output);
118
119 output.push(ESCAPE);
121 if idx <= MAX_SHORT_INDEX as usize {
122 output.push(idx as u8);
123 } else {
124 output.push(ESCAPE_EXTENDED);
125 output.extend_from_slice(&(idx as u16).to_le_bytes());
126 }
127
128 last_end = key.end;
129 } else {
130 escape_copy(&data[last_end..key.end], &mut output);
132 last_end = key.end;
133 }
134 }
135
136 escape_copy(&data[last_end..], &mut output);
138
139 let metadata = serialize_dict(&dict);
141 if output.len() >= data.len() {
142 return None;
143 }
144
145 Some(TransformResult {
146 data: output,
147 metadata,
148 })
149}
150
151pub fn reverse(data: &[u8], metadata: &[u8]) -> Vec<u8> {
153 let dict = deserialize_dict(metadata);
154 let mut output = Vec::with_capacity(data.len() * 2);
155 let mut pos = 0;
156
157 while pos < data.len() {
158 if data[pos] == ESCAPE {
159 pos += 1;
160 if pos >= data.len() {
161 break;
162 }
163 match data[pos] {
164 ESCAPE_LITERAL => {
165 output.push(ESCAPE);
166 pos += 1;
167 }
168 ESCAPE_EXTENDED => {
169 pos += 1;
170 if pos + 2 <= data.len() {
171 let idx = u16::from_le_bytes([data[pos], data[pos + 1]]) as usize;
172 pos += 2;
173 if idx < dict.len() {
174 output.push(b'"');
175 output.extend_from_slice(&dict[idx]);
176 output.push(b'"');
177 }
178 }
179 }
180 idx if idx <= MAX_SHORT_INDEX => {
181 let idx = idx as usize;
182 if idx < dict.len() {
183 output.push(b'"');
184 output.extend_from_slice(&dict[idx]);
185 output.push(b'"');
186 }
187 pos += 1;
188 }
189 _ => {
190 pos += 1;
192 }
193 }
194 } else {
195 output.push(data[pos]);
196 pos += 1;
197 }
198 }
199
200 output
201}
202
203fn escape_copy(src: &[u8], dst: &mut Vec<u8>) {
205 for &b in src {
206 if b == ESCAPE {
207 dst.push(ESCAPE);
208 dst.push(ESCAPE_LITERAL);
209 } else {
210 dst.push(b);
211 }
212 }
213}
214
/// Encodes the dictionary as: version byte (1), little-endian u16 entry
/// count, then for each entry a little-endian u16 length followed by the
/// raw key bytes.
fn serialize_dict(dict: &[Vec<u8>]) -> Vec<u8> {
    let mut encoded = vec![1u8];
    encoded.extend_from_slice(&(dict.len() as u16).to_le_bytes());
    for entry in dict {
        encoded.extend_from_slice(&(entry.len() as u16).to_le_bytes());
        encoded.extend_from_slice(entry);
    }
    encoded
}
/// Decodes the metadata produced by `serialize_dict`. Truncated or
/// undersized input yields as many complete entries as could be read
/// (possibly none) instead of panicking.
fn deserialize_dict(data: &[u8]) -> Vec<Vec<u8>> {
    if data.len() < 3 {
        return vec![];
    }
    // Byte 0 is the format version (currently unchecked); bytes 1-2 hold
    // the little-endian entry count.
    let count = u16::from_le_bytes([data[1], data[2]]) as usize;
    let mut entries = Vec::with_capacity(count);
    let mut cursor = 3;
    for _ in 0..count {
        let len = match data.get(cursor..cursor + 2) {
            Some(b) => u16::from_le_bytes([b[0], b[1]]) as usize,
            None => break,
        };
        cursor += 2;
        match data.get(cursor..cursor + len) {
            Some(entry) => entries.push(entry.to_vec()),
            None => break,
        }
        cursor += len;
    }
    entries
}
#[cfg(test)]
mod tests {
    use super::*;

    // A quoted string counts as a key only when followed by ':'.
    #[test]
    fn find_keys_simple() {
        let data = br#"{"name": "Alice", "age": 30}"#;
        let keys = find_keys(data);
        assert_eq!(keys.len(), 2);
        assert_eq!(keys[0].content, b"name");
        assert_eq!(keys[1].content, b"age");
    }

    // Keys are collected from nested objects and duplicates are kept.
    #[test]
    fn find_keys_nested() {
        let data = br#"{"a": {"b": 1, "c": 2}, "a": {"b": 3}}"#;
        let keys = find_keys(data);
        assert_eq!(keys.len(), 5);
    }

    // An escaped quote inside a key must not terminate the string early;
    // the escape sequence is preserved verbatim in `content`.
    #[test]
    fn find_keys_escaped_quotes() {
        let data = br#"{"key\"name": "val"}"#;
        let keys = find_keys(data);
        assert_eq!(keys.len(), 1);
        assert_eq!(keys[0].content, br#"key\"name"#.to_vec());
    }

    // preprocess followed by reverse must restore the input byte-for-byte.
    #[test]
    fn roundtrip_simple() {
        let data = br#"{"name": "Alice", "age": 30, "name": "Bob", "age": 25}"#;
        let result = preprocess(data).expect("should produce transform");
        let restored = reverse(&result.data, &result.metadata);
        assert_eq!(restored, data.to_vec());
    }

    // Roundtrip with nested objects and mixed repeated/unique keys.
    #[test]
    fn roundtrip_nested() {
        let data = br#"{"id": 1, "data": {"id": 2, "type": "x"}, "id": 3, "type": "y"}"#;
        let result = preprocess(data).expect("should produce transform");
        let restored = reverse(&result.data, &result.metadata);
        assert_eq!(restored, data.to_vec());
    }

    // Newline-delimited JSON repeats the same keys on every line — the
    // prime use case for key interning.
    #[test]
    fn roundtrip_ndjson_lines() {
        let data = br#"{"ts":"a","val":1}
{"ts":"b","val":2}
{"ts":"c","val":3}
"#;
        let result = preprocess(data).expect("should produce transform");
        let restored = reverse(&result.data, &result.metadata);
        assert_eq!(restored, data.to_vec());
    }

    // Keys that occur only once never enter the dictionary, so the
    // transform is declined entirely.
    #[test]
    fn no_transform_unique_keys() {
        let data = br#"{"a": 1, "b": 2, "c": 3}"#;
        assert!(
            preprocess(data).is_none(),
            "unique keys should not be interned"
        );
    }

    // serialize_dict and deserialize_dict are exact inverses.
    #[test]
    fn dict_roundtrip() {
        let dict = vec![b"name".to_vec(), b"age".to_vec(), b"city".to_vec()];
        let serialized = serialize_dict(&dict);
        let deserialized = deserialize_dict(&serialized);
        assert_eq!(deserialized, dict);
    }

    // Net win: interned payload plus dictionary metadata must be smaller
    // than the original for key-heavy input.
    #[test]
    fn size_reduction() {
        let data = br#"{"name":"Alice","age":30,"name":"Bob","age":25,"name":"Carol","age":35}"#;
        let result = preprocess(data).expect("should produce transform");
        assert!(
            result.data.len() + result.metadata.len() < data.len(),
            "interned={} + meta={} should be < original={}",
            result.data.len(),
            result.metadata.len(),
            data.len()
        );
    }
}
337}