base_d/encoders/algorithms/schema/parsers/
json.rs1use crate::encoders::algorithms::schema::fiche::NEST_SEP;
2use crate::encoders::algorithms::schema::parsers::InputParser;
3use crate::encoders::algorithms::schema::types::*;
4use serde_json::{Map, Value};
5use std::collections::HashMap;
6
/// Parser that turns JSON text into the schema encoder's `IntermediateRepresentation`.
///
/// It carries no state; all behaviour lives in its `InputParser` implementation.
pub struct JsonParser;
8
9impl InputParser for JsonParser {
10 type Error = SchemaError;
11
12 fn parse(input: &str) -> Result<IntermediateRepresentation, Self::Error> {
13 let parsed: Value = serde_json::from_str(input).map_err(|e| {
14 SchemaError::InvalidInput(format!(
15 "Invalid JSON syntax: {}\n\
16 Ensure the input is valid JSON.",
17 e
18 ))
19 })?;
20
21 match parsed {
22 Value::Array(arr) => parse_array(arr),
23 Value::Object(obj) => parse_object(obj),
24 _ => Err(SchemaError::InvalidInput(
25 "Expected JSON object or array at root level.\n\
26 Schema encoding works with:\n\
27 - Single object: {\"name\": \"value\"}\n\
28 - Array of objects: [{\"id\": 1}, {\"id\": 2}]\n\
29 - Object with array: {\"users\": [{\"id\": 1}]}"
30 .to_string(),
31 )),
32 }
33 }
34}
35
36fn parse_array(arr: Vec<Value>) -> Result<IntermediateRepresentation, SchemaError> {
38 if arr.is_empty() {
39 return Err(SchemaError::InvalidInput(
40 "Empty array - cannot infer schema from zero rows.\n\
41 Provide at least one object in the array."
42 .to_string(),
43 ));
44 }
45
46 let row_count = arr.len();
47 let mut all_rows: Vec<Map<String, Value>> = Vec::new();
48
49 for (idx, item) in arr.into_iter().enumerate() {
51 match item {
52 Value::Object(obj) => all_rows.push(obj),
53 other => {
54 let type_name = match other {
55 Value::Null => "null",
56 Value::Bool(_) => "boolean",
57 Value::Number(_) => "number",
58 Value::String(_) => "string",
59 Value::Array(_) => "array",
60 Value::Object(_) => unreachable!(),
61 };
62 return Err(SchemaError::InvalidInput(format!(
63 "Array must contain only objects (tabular data). Found {} at index {}.\n\
64 Schema encoding expects arrays of objects like: [{{\"id\": 1}}, {{\"id\": 2}}]",
65 type_name, idx
66 )));
67 }
68 }
69 }
70
71 let mut flattened_rows: Vec<HashMap<String, Value>> = Vec::new();
73 let mut all_field_names = std::collections::BTreeSet::new();
74
75 for obj in &all_rows {
76 let flattened = flatten_object(obj, "");
77 for key in flattened.keys() {
78 all_field_names.insert(key.clone());
79 }
80 flattened_rows.push(flattened);
81 }
82
83 let field_names: Vec<String> = all_field_names.into_iter().collect();
84
85 let mut fields = Vec::new();
87 let mut has_nulls = false;
88
89 for field_name in &field_names {
90 let field_type = infer_field_type(&flattened_rows, field_name, &mut has_nulls)?;
91 fields.push(FieldDef::new(field_name.clone(), field_type));
92 }
93
94 let mut values = Vec::new();
96 let total_values = row_count * fields.len();
97 let bitmap_bytes = total_values.div_ceil(8);
98 let mut null_bitmap = vec![0u8; bitmap_bytes];
99
100 for (row_idx, row) in flattened_rows.iter().enumerate() {
101 for (field_idx, field) in fields.iter().enumerate() {
102 let value_idx = row_idx * fields.len() + field_idx;
103
104 if let Some(json_value) = row.get(&field.name)
105 && json_value.is_null()
106 {
107 values.push(SchemaValue::Null);
108 set_null_bit(&mut null_bitmap, value_idx);
109 has_nulls = true;
110 } else if let Some(json_value) = row.get(&field.name) {
111 values.push(json_to_schema_value(json_value, &field.field_type)?);
112 } else {
113 values.push(SchemaValue::Null);
115 set_null_bit(&mut null_bitmap, value_idx);
116 has_nulls = true;
117 }
118 }
119 }
120
121 let mut header = SchemaHeader::new(row_count, fields);
123 if has_nulls {
124 header.null_bitmap = Some(null_bitmap);
125 header.set_flag(FLAG_HAS_NULLS);
126 }
127
128 IntermediateRepresentation::new(header, values)
129}
130
131fn parse_object(obj: Map<String, Value>) -> Result<IntermediateRepresentation, SchemaError> {
133 const WRAPPER_KEYS: &[&str] = &["results", "data", "items", "records"];
135
136 if obj.len() == 1 {
138 let is_root_key_pattern = obj
140 .values()
141 .next()
142 .map(|v| {
143 if let Value::Array(arr) = v {
144 !arr.is_empty() && arr.iter().all(|item| matches!(item, Value::Object(_)))
146 } else {
147 false
148 }
149 })
150 .unwrap_or(false);
151
152 if is_root_key_pattern {
153 let (key, value) = obj.into_iter().next().unwrap();
155 let arr = match value {
157 Value::Array(a) => a,
158 _ => unreachable!(),
159 };
160
161 let mut ir = parse_array(arr)?;
163 ir.header.root_key = Some(key);
164 ir.header.set_flag(FLAG_HAS_ROOT_KEY);
165 return Ok(ir);
166 }
167 }
168
169 for wrapper_key in WRAPPER_KEYS {
171 if let Some(Value::Array(arr)) = obj.get(*wrapper_key)
172 && !arr.is_empty()
173 && arr.iter().all(|item| matches!(item, Value::Object(_)))
174 {
175 let arr = arr.clone();
177 let mut ir = parse_array(arr)?;
178 ir.header.root_key = Some((*wrapper_key).to_string());
179 ir.header.set_flag(FLAG_HAS_ROOT_KEY);
180 return Ok(ir);
181 }
182 }
183
184 let flattened = flatten_object(&obj, "");
186 let mut field_names = Vec::new();
188 collect_field_names_ordered(&obj, "", &mut field_names);
189
190 let mut fields = Vec::new();
191 let mut has_nulls = false;
192
193 for field_name in &field_names {
194 let value = &flattened[field_name];
195 let field_type = infer_type(value);
196 if value.is_null() {
197 has_nulls = true;
198 }
199 fields.push(FieldDef::new(field_name.clone(), field_type));
200 }
201
202 let mut values = Vec::new();
204 let total_values = fields.len();
205 let bitmap_bytes = total_values.div_ceil(8);
206 let mut null_bitmap = vec![0u8; bitmap_bytes];
207
208 for (field_idx, field) in fields.iter().enumerate() {
209 let json_value = &flattened[&field.name];
210 if json_value.is_null() {
211 values.push(SchemaValue::Null);
212 set_null_bit(&mut null_bitmap, field_idx);
213 } else {
214 values.push(json_to_schema_value(json_value, &field.field_type)?);
215 }
216 }
217
218 let mut header = SchemaHeader::new(1, fields);
220 if has_nulls {
221 header.null_bitmap = Some(null_bitmap);
222 header.set_flag(FLAG_HAS_NULLS);
223 }
224
225 IntermediateRepresentation::new(header, values)
226}
227
228fn collect_field_names_ordered(obj: &Map<String, Value>, prefix: &str, names: &mut Vec<String>) {
230 for (key, value) in obj {
231 let full_key = if prefix.is_empty() {
232 key.clone()
233 } else {
234 format!("{}{}{}", prefix, NEST_SEP, key)
235 };
236
237 match value {
238 Value::Object(nested) => {
239 collect_field_names_ordered(nested, &full_key, names);
240 }
241 _ => {
242 names.push(full_key);
243 }
244 }
245 }
246}
247
248fn flatten_object(obj: &Map<String, Value>, prefix: &str) -> HashMap<String, Value> {
250 let mut result = HashMap::new();
251
252 for (key, value) in obj {
253 let full_key = if prefix.is_empty() {
254 key.clone()
255 } else {
256 format!("{}{}{}", prefix, NEST_SEP, key)
257 };
258
259 match value {
260 Value::Object(nested) => {
261 result.extend(flatten_object(nested, &full_key));
262 }
263 _ => {
264 result.insert(full_key, value.clone());
265 }
266 }
267 }
268
269 result
270}
271
272fn infer_type(value: &Value) -> FieldType {
274 match value {
275 Value::Null => FieldType::Null,
276 Value::Bool(_) => FieldType::Bool,
277 Value::Number(n) => {
278 if n.is_f64() {
279 if let Some(f) = n.as_f64()
281 && (f.fract() != 0.0 || f.is_infinite() || f.is_nan())
282 {
283 return FieldType::F64;
284 }
285 }
286
287 if let Some(i) = n.as_i64() {
288 if i < 0 {
289 FieldType::I64
290 } else {
291 FieldType::U64
292 }
293 } else if n.as_u64().is_some() {
294 FieldType::U64
295 } else {
296 FieldType::F64
297 }
298 }
299 Value::String(_) => FieldType::String,
300 Value::Array(arr) => {
301 if arr.is_empty() {
302 FieldType::Array(Box::new(FieldType::Null))
303 } else {
304 let element_type = arr
306 .iter()
307 .find(|v| !v.is_null())
308 .map(infer_type)
309 .unwrap_or(FieldType::Null);
310 FieldType::Array(Box::new(element_type))
311 }
312 }
313 Value::Object(_) => {
314 FieldType::String
316 }
317 }
318}
319
320fn infer_field_type(
322 rows: &[HashMap<String, Value>],
323 field_name: &str,
324 has_nulls: &mut bool,
325) -> Result<FieldType, SchemaError> {
326 let mut inferred_type: Option<FieldType> = None;
327
328 for row in rows {
329 if let Some(value) = row.get(field_name) {
330 if value.is_null() {
331 *has_nulls = true;
332 continue;
333 }
334
335 let current_type = infer_type(value);
336
337 if let Some(ref existing_type) = inferred_type {
338 if let (FieldType::Array(existing_inner), FieldType::Array(current_inner)) =
340 (existing_type, ¤t_type)
341 {
342 if **existing_inner == FieldType::Null && **current_inner != FieldType::Null {
343 inferred_type = Some(current_type.clone());
345 continue;
346 } else if **current_inner == FieldType::Null
347 && **existing_inner != FieldType::Null
348 {
349 continue;
351 }
352 }
353
354 if *existing_type != current_type {
355 return Ok(FieldType::Any);
357 }
358 } else {
359 inferred_type = Some(current_type);
360 }
361 } else {
362 *has_nulls = true;
363 }
364 }
365
366 Ok(inferred_type.unwrap_or(FieldType::Null))
367}
368
369fn json_to_schema_value(
371 value: &Value,
372 expected_type: &FieldType,
373) -> Result<SchemaValue, SchemaError> {
374 match value {
375 Value::Null => Ok(SchemaValue::Null),
376 Value::Bool(b) => Ok(SchemaValue::Bool(*b)),
377 Value::Number(n) => match expected_type {
378 FieldType::U64 | FieldType::Any => {
379 if let Some(u) = n.as_u64() {
380 Ok(SchemaValue::U64(u))
381 } else if let Some(i) = n.as_i64() {
382 Ok(SchemaValue::I64(i))
383 } else {
384 Ok(SchemaValue::F64(n.as_f64().unwrap()))
385 }
386 }
387 FieldType::I64 => {
388 if let Some(i) = n.as_i64() {
389 Ok(SchemaValue::I64(i))
390 } else {
391 Ok(SchemaValue::I64(n.as_f64().unwrap() as i64))
392 }
393 }
394 FieldType::F64 => Ok(SchemaValue::F64(n.as_f64().unwrap())),
395 _ => Err(SchemaError::InvalidInput(format!(
396 "Type mismatch: expected {}, but found number.\n\
397 The field type was inferred or specified as {}, which doesn't accept numeric values.",
398 expected_type.display_name(),
399 expected_type.display_name()
400 ))),
401 },
402 Value::String(s) => Ok(SchemaValue::String(s.clone())),
403 Value::Array(arr) => {
404 let element_type = if let FieldType::Array(et) = expected_type {
405 et.as_ref()
406 } else {
407 return Err(SchemaError::InvalidInput(format!(
408 "Internal error: Expected array type but found {}. This is a bug in type inference.",
409 expected_type.display_name()
410 )));
411 };
412
413 let mut schema_values = Vec::new();
414 for item in arr {
415 schema_values.push(json_to_schema_value(item, element_type)?);
416 }
417 Ok(SchemaValue::Array(schema_values))
418 }
419 Value::Object(_) => Err(SchemaError::InvalidInput(
420 "Internal error: Encountered nested object that wasn't flattened. This is a bug in the JSON parser."
421 .to_string(),
422 )),
423 }
424}
425
/// Sets bit `index` in `bitmap`, counting from the least-significant bit of byte 0.
///
/// # Panics
///
/// Panics if `index / 8` is outside `bitmap`.
fn set_null_bit(bitmap: &mut [u8], index: usize) {
    let mask = 1u8 << (index % 8);
    bitmap[index / 8] |= mask;
}
432
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `json`, panicking if the parser rejects it.
    fn parse_ok(json: &str) -> IntermediateRepresentation {
        JsonParser::parse(json).unwrap()
    }

    /// Column names of `ir`, in header order.
    fn column_names(ir: &IntermediateRepresentation) -> Vec<&str> {
        ir.header
            .fields
            .iter()
            .map(|field| field.name.as_str())
            .collect()
    }

    // A flat object is one row with one column per key.
    #[test]
    fn test_simple_object() {
        let ir = parse_ok(r#"{"id":1,"name":"alice"}"#);

        assert_eq!(ir.header.row_count, 1);
        assert_eq!(ir.header.fields.len(), 2);
        assert_eq!(ir.values.len(), 2);
    }

    // A root array yields one row per element.
    #[test]
    fn test_array_of_objects() {
        let ir = parse_ok(r#"[{"id":1,"name":"alice"},{"id":2,"name":"bob"}]"#);

        assert_eq!(ir.header.row_count, 2);
        assert_eq!(ir.header.fields.len(), 2);
        assert_eq!(ir.values.len(), 4);
    }

    // Nested objects collapse into a single separator-joined column name.
    #[test]
    fn test_nested_object() {
        let ir = parse_ok(r#"{"user":{"profile":{"name":"alice"}}}"#);

        assert_eq!(ir.header.row_count, 1);
        assert_eq!(ir.header.fields.len(), 1);
        assert_eq!(ir.header.fields[0].name, "user჻profile჻name");
    }

    // A single key wrapping an array of objects is recorded as the root key.
    #[test]
    fn test_root_key() {
        let ir = parse_ok(r#"{"users":[{"id":1}]}"#);

        assert_eq!(ir.header.root_key, Some("users".to_string()));
        assert!(ir.header.has_flag(FLAG_HAS_ROOT_KEY));
    }

    // Every scalar kind is accepted, and the null sets the null flag.
    #[test]
    fn test_all_types() {
        let ir = parse_ok(r#"{"u":1,"i":-1,"f":3.14,"s":"test","b":true,"n":null}"#);

        assert_eq!(ir.header.fields.len(), 6);
        assert!(ir.header.has_flag(FLAG_HAS_NULLS));
    }

    // An explicit null is reported as null for its row and column.
    #[test]
    fn test_null_handling() {
        let ir = parse_ok(r#"{"name":"alice","age":null}"#);

        assert!(ir.header.has_flag(FLAG_HAS_NULLS));

        let age_idx = column_names(&ir)
            .iter()
            .position(|name| *name == "age")
            .unwrap();
        assert!(ir.is_null(0, age_idx));
    }

    // The element type of an array is taken from its contents.
    #[test]
    fn test_homogeneous_array() {
        let ir = parse_ok(r#"{"scores":[1,2,3]}"#);

        assert_eq!(
            ir.header.fields[0].field_type,
            FieldType::Array(Box::new(FieldType::U64))
        );
    }

    // An empty array has an unknown (null) element type.
    #[test]
    fn test_empty_array() {
        let ir = parse_ok(r#"{"items":[]}"#);

        assert_eq!(
            ir.header.fields[0].field_type,
            FieldType::Array(Box::new(FieldType::Null))
        );
    }

    // Flattening works through several levels of nesting.
    #[test]
    fn test_deep_nesting() {
        let ir = parse_ok(r#"{"a":{"b":{"c":{"d":1}}}}"#);

        assert_eq!(ir.header.fields[0].name, "a჻b჻c჻d");
    }

    // `flatten_object` on its own produces the joined key.
    #[test]
    fn test_flatten_object() {
        let nested: Map<String, Value> = serde_json::from_str(r#"{"a":{"b":1}}"#).unwrap();
        let flat = flatten_object(&nested, "");

        assert_eq!(flat.len(), 1);
        assert!(flat.contains_key("a჻b"));
    }

    // Top-level scalars and one nested object share a single row.
    #[test]
    fn test_single_level_nesting() {
        let ir = parse_ok(
            r#"{"id":"A1","name":"Jim","grade":{"math":60,"physics":66,"chemistry":61}}"#,
        );

        assert_eq!(ir.header.row_count, 1);
        assert_eq!(ir.header.fields.len(), 5);

        let names = column_names(&ir);
        for expected in ["id", "name", "grade჻math", "grade჻physics", "grade჻chemistry"] {
            assert!(names.contains(&expected));
        }
    }

    // Rows under a root key are flattened like any other row.
    #[test]
    fn test_array_of_nested_objects() {
        let ir = parse_ok(
            r#"{"students":[{"id":"A1","name":"Jim","grade":{"math":60,"physics":66}}]}"#,
        );

        assert_eq!(ir.header.row_count, 1);
        assert_eq!(ir.header.root_key, Some("students".to_string()));

        let names = column_names(&ir);
        for expected in ["id", "name", "grade჻math", "grade჻physics"] {
            assert!(names.contains(&expected));
        }
    }

    // A wrapper-named key holding an object (not an array) is flattened normally.
    #[test]
    fn test_multiple_nested_levels() {
        let ir = parse_ok(r#"{"data":{"user":{"profile":{"address":{"city":"Boston"}}}}}"#);

        assert_eq!(ir.header.fields.len(), 1);
        assert_eq!(ir.header.fields[0].name, "data჻user჻profile჻address჻city");
    }

    // Arrays stay as array-typed leaves while sibling objects are flattened.
    #[test]
    fn test_mixed_arrays_and_objects() {
        let ir = parse_ok(
            r#"{"person":{"name":"Alice","tags":["admin","user"],"address":{"city":"NYC"}}}"#,
        );

        let names = column_names(&ir);
        for expected in ["person჻name", "person჻tags", "person჻address჻city"] {
            assert!(names.contains(&expected));
        }

        let tags_field = ir
            .header
            .fields
            .iter()
            .find(|field| field.name == "person჻tags")
            .unwrap();
        assert!(matches!(tags_field.field_type, FieldType::Array(_)));
    }
}