// datacortex_core/format/json.rs
use std::collections::HashMap;

use super::transform::TransformResult;
14
/// Largest dictionary index that is written as the single byte following [`ESCAPE`].
const MAX_SHORT_INDEX: u8 = 0xFC;
/// Marker byte that introduces every escape sequence in the transformed stream.
const ESCAPE: u8 = 0x00;
/// Tag after [`ESCAPE`] announcing a two-byte little-endian dictionary index (0xFD).
const ESCAPE_EXTENDED: u8 = MAX_SHORT_INDEX + 1;
/// Tag after [`ESCAPE`] standing for one literal [`ESCAPE`] byte of the input (0xFE).
const ESCAPE_LITERAL: u8 = ESCAPE_EXTENDED + 1;
19
/// One quoted string that `find_keys` classified as a JSON object key.
struct KeyOccurrence {
    /// Offset of the opening `"` in the scanned buffer.
    start: usize,
    /// Offset one past the closing `"`.
    end: usize,
    /// Bytes between the quotes, copied verbatim (backslash escapes included).
    content: Vec<u8>,
}
26
27fn find_keys(data: &[u8]) -> Vec<KeyOccurrence> {
30 let mut keys = Vec::new();
31 let mut pos = 0;
32
33 while pos < data.len() {
34 if data[pos] == b'"' {
35 let start = pos;
36 pos += 1;
37 let mut content = Vec::new();
38 let mut escaped = false;
39
40 while pos < data.len() {
41 if escaped {
42 content.push(data[pos]);
43 escaped = false;
44 } else if data[pos] == b'\\' {
45 content.push(data[pos]);
46 escaped = true;
47 } else if data[pos] == b'"' {
48 pos += 1;
49 break;
50 } else {
51 content.push(data[pos]);
52 }
53 pos += 1;
54 }
55
56 let end = pos;
57
58 let mut check = pos;
60 while check < data.len() && data[check].is_ascii_whitespace() {
61 check += 1;
62 }
63 if check < data.len() && data[check] == b':' {
64 keys.push(KeyOccurrence {
65 start,
66 end,
67 content,
68 });
69 }
70 } else {
71 pos += 1;
72 }
73 }
74
75 keys
76}
77
78fn build_dictionary(keys: &[KeyOccurrence]) -> Vec<Vec<u8>> {
80 let mut freq: HashMap<Vec<u8>, usize> = HashMap::new();
81 for k in keys {
82 *freq.entry(k.content.clone()).or_default() += 1;
83 }
84
85 let mut entries: Vec<(Vec<u8>, usize)> =
86 freq.into_iter().filter(|(_, count)| *count > 1).collect();
87
88 entries.sort_by(|a, b| b.1.cmp(&a.1).then(a.0.cmp(&b.0)));
90
91 entries.into_iter().map(|(k, _)| k).collect()
92}
93
94pub fn preprocess(data: &[u8]) -> Option<TransformResult> {
96 let keys = find_keys(data);
97 if keys.is_empty() {
98 return None;
99 }
100
101 let dict = build_dictionary(&keys);
102 if dict.is_empty() {
103 return None;
104 }
105
106 let lookup: HashMap<&[u8], usize> = dict
107 .iter()
108 .enumerate()
109 .map(|(i, k)| (k.as_slice(), i))
110 .collect();
111
112 let mut output = Vec::with_capacity(data.len());
113 let mut last_end = 0;
114
115 for key in &keys {
116 if let Some(&idx) = lookup.get(key.content.as_slice()) {
117 escape_copy(&data[last_end..key.start], &mut output);
119
120 output.push(ESCAPE);
122 if idx <= MAX_SHORT_INDEX as usize {
123 output.push(idx as u8);
124 } else {
125 output.push(ESCAPE_EXTENDED);
126 output.extend_from_slice(&(idx as u16).to_le_bytes());
127 }
128
129 last_end = key.end;
130 } else {
131 escape_copy(&data[last_end..key.end], &mut output);
133 last_end = key.end;
134 }
135 }
136
137 escape_copy(&data[last_end..], &mut output);
139
140 let metadata = serialize_dict(&dict);
142 if output.len() >= data.len() {
143 return None;
144 }
145
146 Some(TransformResult {
147 data: output,
148 metadata,
149 })
150}
151
152pub fn reverse(data: &[u8], metadata: &[u8]) -> Vec<u8> {
154 let dict = deserialize_dict(metadata);
155 let mut output = Vec::with_capacity(data.len() * 2);
156 let mut pos = 0;
157
158 while pos < data.len() {
159 if data[pos] == ESCAPE {
160 pos += 1;
161 if pos >= data.len() {
162 break;
163 }
164 match data[pos] {
165 ESCAPE_LITERAL => {
166 output.push(ESCAPE);
167 pos += 1;
168 }
169 ESCAPE_EXTENDED => {
170 pos += 1;
171 if pos + 2 <= data.len() {
172 let idx = u16::from_le_bytes([data[pos], data[pos + 1]]) as usize;
173 pos += 2;
174 if idx < dict.len() {
175 output.push(b'"');
176 output.extend_from_slice(&dict[idx]);
177 output.push(b'"');
178 }
179 }
180 }
181 idx if idx <= MAX_SHORT_INDEX => {
182 let idx = idx as usize;
183 if idx < dict.len() {
184 output.push(b'"');
185 output.extend_from_slice(&dict[idx]);
186 output.push(b'"');
187 }
188 pos += 1;
189 }
190 _ => {
191 pos += 1;
193 }
194 }
195 } else {
196 output.push(data[pos]);
197 pos += 1;
198 }
199 }
200
201 output
202}
203
204fn escape_copy(src: &[u8], dst: &mut Vec<u8>) {
206 for &b in src {
207 if b == ESCAPE {
208 dst.push(ESCAPE);
209 dst.push(ESCAPE_LITERAL);
210 } else {
211 dst.push(b);
212 }
213 }
214}
215
/// Encodes the dictionary into the metadata byte layout.
///
/// Layout: one leading byte with value `1` (presumably a format version;
/// `deserialize_dict` skips it without checking), the entry count as a
/// little-endian `u16`, then for each entry its length as a little-endian
/// `u16` followed by its bytes.
///
/// The count and the lengths are narrowed with `as u16`, so values above
/// `u16::MAX` are truncated.
fn serialize_dict(dict: &[Vec<u8>]) -> Vec<u8> {
    let payload: usize = dict.iter().map(|entry| 2 + entry.len()).sum();
    let mut buf = Vec::with_capacity(3 + payload);

    buf.push(1);
    buf.extend_from_slice(&(dict.len() as u16).to_le_bytes());
    for entry in dict {
        let len_prefix = (entry.len() as u16).to_le_bytes();
        buf.extend_from_slice(&len_prefix);
        buf.extend_from_slice(entry);
    }

    buf
}
226
/// Decodes the metadata byte layout back into the dictionary entries.
///
/// The first byte is skipped, the next two bytes give the declared entry
/// count (little-endian `u16`), and each entry is a little-endian `u16`
/// length followed by that many bytes.
///
/// Decoding is lenient: input shorter than three bytes yields an empty
/// dictionary, and decoding stops at the first entry that does not fit in
/// the remaining bytes, returning the entries read so far.
fn deserialize_dict(data: &[u8]) -> Vec<Vec<u8>> {
    if data.len() < 3 {
        return Vec::new();
    }

    let declared = usize::from(u16::from_le_bytes([data[1], data[2]]));
    let mut entries = Vec::with_capacity(declared);
    let mut rest = &data[3..];

    for _ in 0..declared {
        if rest.len() < 2 {
            break;
        }
        let (prefix, tail) = rest.split_at(2);
        let len = usize::from(u16::from_le_bytes([prefix[0], prefix[1]]));
        if tail.len() < len {
            break;
        }
        let (entry, remaining) = tail.split_at(len);
        entries.push(entry.to_vec());
        rest = remaining;
    }

    entries
}
251
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `preprocess` followed by `reverse` and checks that the input
    /// comes back byte for byte.
    fn assert_roundtrip(data: &[u8]) {
        let result = preprocess(data).expect("should produce transform");
        let restored = reverse(&result.data, &result.metadata);
        assert_eq!(restored, data.to_vec());
    }

    /// Two top-level keys are reported in document order.
    #[test]
    fn find_keys_simple() {
        let keys = find_keys(br#"{"name": "Alice", "age": 30}"#);
        let contents: Vec<&[u8]> = keys.iter().map(|k| k.content.as_slice()).collect();
        assert_eq!(contents, vec![&b"name"[..], &b"age"[..]]);
    }

    /// Keys of nested objects are counted alongside the outer ones.
    #[test]
    fn find_keys_nested() {
        let keys = find_keys(br#"{"a": {"b": 1, "c": 2}, "a": {"b": 3}}"#);
        assert_eq!(keys.len(), 5);
    }

    /// An escaped quote does not end the key and stays in its content.
    #[test]
    fn find_keys_escaped_quotes() {
        let keys = find_keys(br#"{"key\"name": "val"}"#);
        assert_eq!(keys.len(), 1);
        assert_eq!(keys[0].content, br#"key\"name"#.to_vec());
    }

    #[test]
    fn roundtrip_simple() {
        assert_roundtrip(br#"{"name": "Alice", "age": 30, "name": "Bob", "age": 25}"#);
    }

    #[test]
    fn roundtrip_nested() {
        assert_roundtrip(br#"{"id": 1, "data": {"id": 2, "type": "x"}, "id": 3, "type": "y"}"#);
    }

    /// Newline-delimited objects share one dictionary across lines.
    #[test]
    fn roundtrip_ndjson_lines() {
        let data = br#"{"ts":"a","val":1}
{"ts":"b","val":2}
{"ts":"c","val":3}
"#;
        assert_roundtrip(data);
    }

    /// With no repeated key there is nothing to intern.
    #[test]
    fn no_transform_unique_keys() {
        let outcome = preprocess(br#"{"a": 1, "b": 2, "c": 3}"#);
        assert!(outcome.is_none(), "unique keys should not be interned");
    }

    #[test]
    fn dict_roundtrip() {
        let dict = vec![b"name".to_vec(), b"age".to_vec(), b"city".to_vec()];
        assert_eq!(deserialize_dict(&serialize_dict(&dict)), dict);
    }

    /// The transformed stream plus its dictionary beats the original size.
    #[test]
    fn size_reduction() {
        let data = br#"{"name":"Alice","age":30,"name":"Bob","age":25,"name":"Carol","age":35}"#;
        let result = preprocess(data).expect("should produce transform");
        let interned = result.data.len();
        let meta = result.metadata.len();
        assert!(
            interned + meta < data.len(),
            "interned={} + meta={} should be < original={}",
            interned,
            meta,
            data.len()
        );
    }
}