base_d/encoders/algorithms/schema/parsers/
json.rs1use crate::encoders::algorithms::schema::parsers::InputParser;
2use crate::encoders::algorithms::schema::types::*;
3use serde_json::{Map, Value};
4use std::collections::HashMap;
5
6pub struct JsonParser;
7
8impl InputParser for JsonParser {
9 type Error = SchemaError;
10
11 fn parse(input: &str) -> Result<IntermediateRepresentation, Self::Error> {
12 let parsed: Value = serde_json::from_str(input).map_err(|e| {
13 SchemaError::InvalidInput(format!(
14 "Invalid JSON syntax: {}\n\
15 Ensure the input is valid JSON.",
16 e
17 ))
18 })?;
19
20 match parsed {
21 Value::Array(arr) => parse_array(arr),
22 Value::Object(obj) => parse_object(obj),
23 _ => Err(SchemaError::InvalidInput(
24 "Expected JSON object or array at root level.\n\
25 Schema encoding works with:\n\
26 - Single object: {\"name\": \"value\"}\n\
27 - Array of objects: [{\"id\": 1}, {\"id\": 2}]\n\
28 - Object with array: {\"users\": [{\"id\": 1}]}"
29 .to_string(),
30 )),
31 }
32 }
33}
34
35fn parse_array(arr: Vec<Value>) -> Result<IntermediateRepresentation, SchemaError> {
37 if arr.is_empty() {
38 return Err(SchemaError::InvalidInput(
39 "Empty array - cannot infer schema from zero rows.\n\
40 Provide at least one object in the array."
41 .to_string(),
42 ));
43 }
44
45 let row_count = arr.len();
46 let mut all_rows: Vec<Map<String, Value>> = Vec::new();
47
48 for (idx, item) in arr.into_iter().enumerate() {
50 match item {
51 Value::Object(obj) => all_rows.push(obj),
52 other => {
53 let type_name = match other {
54 Value::Null => "null",
55 Value::Bool(_) => "boolean",
56 Value::Number(_) => "number",
57 Value::String(_) => "string",
58 Value::Array(_) => "array",
59 Value::Object(_) => unreachable!(),
60 };
61 return Err(SchemaError::InvalidInput(format!(
62 "Array must contain only objects (tabular data). Found {} at index {}.\n\
63 Schema encoding expects arrays of objects like: [{{\"id\": 1}}, {{\"id\": 2}}]",
64 type_name, idx
65 )));
66 }
67 }
68 }
69
70 let mut flattened_rows: Vec<HashMap<String, Value>> = Vec::new();
72 let mut all_field_names = std::collections::BTreeSet::new();
73
74 for obj in &all_rows {
75 let flattened = flatten_object(obj, "");
76 for key in flattened.keys() {
77 all_field_names.insert(key.clone());
78 }
79 flattened_rows.push(flattened);
80 }
81
82 let field_names: Vec<String> = all_field_names.into_iter().collect();
83
84 let mut fields = Vec::new();
86 let mut has_nulls = false;
87
88 for field_name in &field_names {
89 let field_type = infer_field_type(&flattened_rows, field_name, &mut has_nulls)?;
90 fields.push(FieldDef::new(field_name.clone(), field_type));
91 }
92
93 let mut values = Vec::new();
95 let total_values = row_count * fields.len();
96 let bitmap_bytes = total_values.div_ceil(8);
97 let mut null_bitmap = vec![0u8; bitmap_bytes];
98
99 for (row_idx, row) in flattened_rows.iter().enumerate() {
100 for (field_idx, field) in fields.iter().enumerate() {
101 let value_idx = row_idx * fields.len() + field_idx;
102
103 if let Some(json_value) = row.get(&field.name)
104 && json_value.is_null()
105 {
106 values.push(SchemaValue::Null);
107 set_null_bit(&mut null_bitmap, value_idx);
108 has_nulls = true;
109 } else if let Some(json_value) = row.get(&field.name) {
110 values.push(json_to_schema_value(json_value, &field.field_type)?);
111 } else {
112 values.push(SchemaValue::Null);
114 set_null_bit(&mut null_bitmap, value_idx);
115 has_nulls = true;
116 }
117 }
118 }
119
120 let mut header = SchemaHeader::new(row_count, fields);
122 if has_nulls {
123 header.null_bitmap = Some(null_bitmap);
124 header.set_flag(FLAG_HAS_NULLS);
125 }
126
127 IntermediateRepresentation::new(header, values)
128}
129
130fn parse_object(obj: Map<String, Value>) -> Result<IntermediateRepresentation, SchemaError> {
132 if obj.len() == 1 {
135 let is_root_key_pattern = obj
137 .values()
138 .next()
139 .map(|v| {
140 if let Value::Array(arr) = v {
141 !arr.is_empty() && arr.iter().all(|item| matches!(item, Value::Object(_)))
143 } else {
144 false
145 }
146 })
147 .unwrap_or(false);
148
149 if is_root_key_pattern {
150 let (key, value) = obj.into_iter().next().unwrap();
152 let arr = match value {
154 Value::Array(a) => a,
155 _ => unreachable!(),
156 };
157
158 let mut ir = parse_array(arr)?;
160 ir.header.root_key = Some(key);
161 ir.header.set_flag(FLAG_HAS_ROOT_KEY);
162 return Ok(ir);
163 }
164 }
165
166 let flattened = flatten_object(&obj, "");
168 let mut field_names = Vec::new();
170 collect_field_names_ordered(&obj, "", &mut field_names);
171
172 let mut fields = Vec::new();
173 let mut has_nulls = false;
174
175 for field_name in &field_names {
176 let value = &flattened[field_name];
177 let field_type = infer_type(value);
178 if value.is_null() {
179 has_nulls = true;
180 }
181 fields.push(FieldDef::new(field_name.clone(), field_type));
182 }
183
184 let mut values = Vec::new();
186 let total_values = fields.len();
187 let bitmap_bytes = total_values.div_ceil(8);
188 let mut null_bitmap = vec![0u8; bitmap_bytes];
189
190 for (field_idx, field) in fields.iter().enumerate() {
191 let json_value = &flattened[&field.name];
192 if json_value.is_null() {
193 values.push(SchemaValue::Null);
194 set_null_bit(&mut null_bitmap, field_idx);
195 } else {
196 values.push(json_to_schema_value(json_value, &field.field_type)?);
197 }
198 }
199
200 let mut header = SchemaHeader::new(1, fields);
202 if has_nulls {
203 header.null_bitmap = Some(null_bitmap);
204 header.set_flag(FLAG_HAS_NULLS);
205 }
206
207 IntermediateRepresentation::new(header, values)
208}
209
210fn collect_field_names_ordered(obj: &Map<String, Value>, prefix: &str, names: &mut Vec<String>) {
212 for (key, value) in obj {
213 let full_key = if prefix.is_empty() {
214 key.clone()
215 } else {
216 format!("{}.{}", prefix, key)
217 };
218
219 match value {
220 Value::Object(nested) => {
221 collect_field_names_ordered(nested, &full_key, names);
222 }
223 _ => {
224 names.push(full_key);
225 }
226 }
227 }
228}
229
230fn flatten_object(obj: &Map<String, Value>, prefix: &str) -> HashMap<String, Value> {
232 let mut result = HashMap::new();
233
234 for (key, value) in obj {
235 let full_key = if prefix.is_empty() {
236 key.clone()
237 } else {
238 format!("{}.{}", prefix, key)
239 };
240
241 match value {
242 Value::Object(nested) => {
243 result.extend(flatten_object(nested, &full_key));
244 }
245 _ => {
246 result.insert(full_key, value.clone());
247 }
248 }
249 }
250
251 result
252}
253
254fn infer_type(value: &Value) -> FieldType {
256 match value {
257 Value::Null => FieldType::Null,
258 Value::Bool(_) => FieldType::Bool,
259 Value::Number(n) => {
260 if n.is_f64() {
261 if let Some(f) = n.as_f64()
263 && (f.fract() != 0.0 || f.is_infinite() || f.is_nan())
264 {
265 return FieldType::F64;
266 }
267 }
268
269 if let Some(i) = n.as_i64() {
270 if i < 0 {
271 FieldType::I64
272 } else {
273 FieldType::U64
274 }
275 } else if n.as_u64().is_some() {
276 FieldType::U64
277 } else {
278 FieldType::F64
279 }
280 }
281 Value::String(_) => FieldType::String,
282 Value::Array(arr) => {
283 if arr.is_empty() {
284 FieldType::Array(Box::new(FieldType::Null))
285 } else {
286 let element_type = arr
288 .iter()
289 .find(|v| !v.is_null())
290 .map(infer_type)
291 .unwrap_or(FieldType::Null);
292 FieldType::Array(Box::new(element_type))
293 }
294 }
295 Value::Object(_) => {
296 FieldType::String
298 }
299 }
300}
301
302fn infer_field_type(
304 rows: &[HashMap<String, Value>],
305 field_name: &str,
306 has_nulls: &mut bool,
307) -> Result<FieldType, SchemaError> {
308 let mut inferred_type: Option<FieldType> = None;
309
310 for row in rows {
311 if let Some(value) = row.get(field_name) {
312 if value.is_null() {
313 *has_nulls = true;
314 continue;
315 }
316
317 let current_type = infer_type(value);
318
319 if let Some(ref existing_type) = inferred_type {
320 if *existing_type != current_type {
321 return Ok(FieldType::Any);
323 }
324 } else {
325 inferred_type = Some(current_type);
326 }
327 } else {
328 *has_nulls = true;
329 }
330 }
331
332 Ok(inferred_type.unwrap_or(FieldType::Null))
333}
334
335fn json_to_schema_value(
337 value: &Value,
338 expected_type: &FieldType,
339) -> Result<SchemaValue, SchemaError> {
340 match value {
341 Value::Null => Ok(SchemaValue::Null),
342 Value::Bool(b) => Ok(SchemaValue::Bool(*b)),
343 Value::Number(n) => match expected_type {
344 FieldType::U64 | FieldType::Any => {
345 if let Some(u) = n.as_u64() {
346 Ok(SchemaValue::U64(u))
347 } else if let Some(i) = n.as_i64() {
348 Ok(SchemaValue::I64(i))
349 } else {
350 Ok(SchemaValue::F64(n.as_f64().unwrap()))
351 }
352 }
353 FieldType::I64 => {
354 if let Some(i) = n.as_i64() {
355 Ok(SchemaValue::I64(i))
356 } else {
357 Ok(SchemaValue::I64(n.as_f64().unwrap() as i64))
358 }
359 }
360 FieldType::F64 => Ok(SchemaValue::F64(n.as_f64().unwrap())),
361 _ => Err(SchemaError::InvalidInput(format!(
362 "Type mismatch: expected {}, but found number.\n\
363 The field type was inferred or specified as {}, which doesn't accept numeric values.",
364 expected_type.display_name(),
365 expected_type.display_name()
366 ))),
367 },
368 Value::String(s) => Ok(SchemaValue::String(s.clone())),
369 Value::Array(arr) => {
370 let element_type = if let FieldType::Array(et) = expected_type {
371 et.as_ref()
372 } else {
373 return Err(SchemaError::InvalidInput(format!(
374 "Internal error: Expected array type but found {}. This is a bug in type inference.",
375 expected_type.display_name()
376 )));
377 };
378
379 let mut schema_values = Vec::new();
380 for item in arr {
381 schema_values.push(json_to_schema_value(item, element_type)?);
382 }
383 Ok(SchemaValue::Array(schema_values))
384 }
385 Value::Object(_) => Err(SchemaError::InvalidInput(
386 "Internal error: Encountered nested object that wasn't flattened. This is a bug in the JSON parser."
387 .to_string(),
388 )),
389 }
390}
391
/// Sets the bit for `index` in `bitmap`.
///
/// Bits are numbered least-significant first within each byte: index 0 is bit 0 of
/// byte 0, index 8 is bit 0 of byte 1.
///
/// # Panics
///
/// Panics if `index / 8` is outside `bitmap`.
fn set_null_bit(bitmap: &mut [u8], index: usize) {
    let mask = 1u8 << (index & 7);
    bitmap[index >> 3] |= mask;
}
398
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `input` and panics with the parser's error if it is rejected.
    fn parse_ok(input: &str) -> IntermediateRepresentation {
        JsonParser::parse(input).unwrap()
    }

    #[test]
    fn test_simple_object() {
        let ir = parse_ok(r#"{"id":1,"name":"alice"}"#);

        assert_eq!(ir.header.row_count, 1);
        assert_eq!(ir.header.fields.len(), 2);
        assert_eq!(ir.values.len(), 2);
    }

    #[test]
    fn test_array_of_objects() {
        let ir = parse_ok(r#"[{"id":1,"name":"alice"},{"id":2,"name":"bob"}]"#);

        assert_eq!(ir.header.row_count, 2);
        assert_eq!(ir.header.fields.len(), 2);
        // Two rows times two fields.
        assert_eq!(ir.values.len(), 4);
    }

    #[test]
    fn test_nested_object() {
        let ir = parse_ok(r#"{"user":{"profile":{"name":"alice"}}}"#);

        assert_eq!(ir.header.row_count, 1);
        assert_eq!(ir.header.fields.len(), 1);
        assert_eq!(ir.header.fields[0].name, "user.profile.name");
    }

    #[test]
    fn test_root_key() {
        let ir = parse_ok(r#"{"users":[{"id":1}]}"#);

        assert_eq!(ir.header.root_key, Some("users".to_string()));
        assert!(ir.header.has_flag(FLAG_HAS_ROOT_KEY));
    }

    #[test]
    fn test_all_types() {
        let ir = parse_ok(r#"{"u":1,"i":-1,"f":3.14,"s":"test","b":true,"n":null}"#);

        assert_eq!(ir.header.fields.len(), 6);
        assert!(ir.header.has_flag(FLAG_HAS_NULLS));
    }

    #[test]
    fn test_null_handling() {
        let ir = parse_ok(r#"{"name":"alice","age":null}"#);

        assert!(ir.header.has_flag(FLAG_HAS_NULLS));

        let age_idx = ir
            .header
            .fields
            .iter()
            .position(|f| f.name == "age")
            .unwrap();
        assert!(ir.is_null(0, age_idx));
    }

    #[test]
    fn test_homogeneous_array() {
        let ir = parse_ok(r#"{"scores":[1,2,3]}"#);

        let expected = FieldType::Array(Box::new(FieldType::U64));
        assert_eq!(ir.header.fields[0].field_type, expected);
    }

    #[test]
    fn test_empty_array() {
        let ir = parse_ok(r#"{"items":[]}"#);

        let expected = FieldType::Array(Box::new(FieldType::Null));
        assert_eq!(ir.header.fields[0].field_type, expected);
    }

    #[test]
    fn test_deep_nesting() {
        let ir = parse_ok(r#"{"a":{"b":{"c":{"d":1}}}}"#);

        assert_eq!(ir.header.fields[0].name, "a.b.c.d");
    }

    #[test]
    fn test_flatten_object() {
        let obj: Map<String, Value> = serde_json::from_str(r#"{"a":{"b":1}}"#).unwrap();
        let flattened = flatten_object(&obj, "");

        assert_eq!(flattened.len(), 1);
        assert!(flattened.contains_key("a.b"));
    }
}