1use crate::encoders::algorithms::schema::fiche::NEST_SEP;
2use crate::encoders::algorithms::schema::parsers::InputParser;
3use crate::encoders::algorithms::schema::types::*;
4use serde_json::{Map, Value};
5use std::collections::HashMap;
6
/// Stateless parser that turns JSON text into an `IntermediateRepresentation`
/// through its `InputParser` implementation.
pub struct JsonParser;
8
9impl InputParser for JsonParser {
10 type Error = SchemaError;
11
12 fn parse(input: &str) -> Result<IntermediateRepresentation, Self::Error> {
13 let parsed: Value = serde_json::from_str(input).map_err(|e| {
14 SchemaError::InvalidInput(format!(
15 "Invalid JSON syntax: {}\n\
16 Ensure the input is valid JSON.",
17 e
18 ))
19 })?;
20
21 match parsed {
22 Value::Array(arr) => parse_array(arr),
23 Value::Object(obj) => parse_object(obj),
24 _ => Err(SchemaError::InvalidInput(
25 "Expected JSON object or array at root level.\n\
26 Schema encoding works with:\n\
27 - Single object: {\"name\": \"value\"}\n\
28 - Array of objects: [{\"id\": 1}, {\"id\": 2}]\n\
29 - Object with array: {\"users\": [{\"id\": 1}]}"
30 .to_string(),
31 )),
32 }
33 }
34}
35
36fn parse_array(arr: Vec<Value>) -> Result<IntermediateRepresentation, SchemaError> {
38 if arr.is_empty() {
39 return Err(SchemaError::InvalidInput(
40 "Empty array - cannot infer schema from zero rows.\n\
41 Provide at least one object in the array."
42 .to_string(),
43 ));
44 }
45
46 let row_count = arr.len();
47 let mut all_rows: Vec<Map<String, Value>> = Vec::new();
48
49 for (idx, item) in arr.into_iter().enumerate() {
51 match item {
52 Value::Object(obj) => all_rows.push(obj),
53 other => {
54 let type_name = match other {
55 Value::Null => "null",
56 Value::Bool(_) => "boolean",
57 Value::Number(_) => "number",
58 Value::String(_) => "string",
59 Value::Array(_) => "array",
60 Value::Object(_) => unreachable!(),
61 };
62 return Err(SchemaError::InvalidInput(format!(
63 "Array must contain only objects (tabular data). Found {} at index {}.\n\
64 Schema encoding expects arrays of objects like: [{{\"id\": 1}}, {{\"id\": 2}}]",
65 type_name, idx
66 )));
67 }
68 }
69 }
70
71 let mut flattened_rows: Vec<HashMap<String, Value>> = Vec::new();
73 let mut all_field_names = std::collections::BTreeSet::new();
74
75 for obj in &all_rows {
76 let flattened = flatten_object(obj, "");
77 for key in flattened.keys() {
78 all_field_names.insert(key.clone());
79 }
80 flattened_rows.push(flattened);
81 }
82
83 let field_names: Vec<String> = all_field_names.into_iter().collect();
84
85 let mut fields = Vec::new();
87 let mut has_nulls = false;
88
89 for field_name in &field_names {
90 let field_type = infer_field_type(&flattened_rows, field_name, &mut has_nulls)?;
91 fields.push(FieldDef::new(field_name.clone(), field_type));
92 }
93
94 let mut values = Vec::new();
96 let total_values = row_count * fields.len();
97 let bitmap_bytes = total_values.div_ceil(8);
98 let mut null_bitmap = vec![0u8; bitmap_bytes];
99
100 for (row_idx, row) in flattened_rows.iter().enumerate() {
101 for (field_idx, field) in fields.iter().enumerate() {
102 let value_idx = row_idx * fields.len() + field_idx;
103
104 if let Some(json_value) = row.get(&field.name)
105 && json_value.is_null()
106 {
107 values.push(SchemaValue::Null);
108 set_null_bit(&mut null_bitmap, value_idx);
109 has_nulls = true;
110 } else if let Some(json_value) = row.get(&field.name) {
111 values.push(json_to_schema_value(json_value, &field.field_type)?);
112 } else {
113 values.push(SchemaValue::Null);
115 set_null_bit(&mut null_bitmap, value_idx);
116 has_nulls = true;
117 }
118 }
119 }
120
121 let mut header = SchemaHeader::new(row_count, fields);
123 if has_nulls {
124 header.null_bitmap = Some(null_bitmap);
125 header.set_flag(FLAG_HAS_NULLS);
126 }
127
128 IntermediateRepresentation::new(header, values)
129}
130
131fn parse_object(obj: Map<String, Value>) -> Result<IntermediateRepresentation, SchemaError> {
133 const WRAPPER_KEYS: &[&str] = &["results", "data", "items", "records"];
135
136 let mut array_field: Option<(String, Vec<Value>)> = None;
138 let mut scalar_fields: std::collections::HashMap<String, String> =
139 std::collections::HashMap::new();
140
141 for (key, value) in &obj {
142 match value {
143 Value::Array(arr)
144 if !arr.is_empty() && arr.iter().all(|item| matches!(item, Value::Object(_))) =>
145 {
146 if array_field.is_none() {
147 array_field = Some((key.clone(), arr.clone()));
148 } else {
149 array_field = None;
151 scalar_fields.clear();
152 break;
153 }
154 }
155 Value::String(s) => {
156 scalar_fields.insert(key.clone(), s.clone());
157 }
158 Value::Number(n) => {
159 scalar_fields.insert(key.clone(), n.to_string());
160 }
161 Value::Bool(b) => {
162 scalar_fields.insert(key.clone(), b.to_string());
163 }
164 Value::Null => {
165 scalar_fields.insert(key.clone(), "∅".to_string());
167 }
168 _ => {
169 scalar_fields.clear();
171 array_field = None;
172 break;
173 }
174 }
175 }
176
177 if let Some((array_key, arr)) = array_field
179 && !scalar_fields.is_empty()
180 {
181 let mut ir = parse_array(arr)?;
182 ir.header.root_key = Some(array_key);
183 ir.header.set_flag(FLAG_HAS_ROOT_KEY);
184 ir.header.metadata = Some(scalar_fields);
185 return Ok(ir);
186 }
187
188 if obj.len() == 1 {
190 let is_root_key_pattern = obj
192 .values()
193 .next()
194 .map(|v| {
195 if let Value::Array(arr) = v {
196 !arr.is_empty() && arr.iter().all(|item| matches!(item, Value::Object(_)))
198 } else {
199 false
200 }
201 })
202 .unwrap_or(false);
203
204 if is_root_key_pattern {
205 let (key, value) = obj.into_iter().next().unwrap();
207 let arr = match value {
209 Value::Array(a) => a,
210 _ => unreachable!(),
211 };
212
213 let mut ir = parse_array(arr)?;
215 ir.header.root_key = Some(key);
216 ir.header.set_flag(FLAG_HAS_ROOT_KEY);
217 return Ok(ir);
218 }
219 }
220
221 for wrapper_key in WRAPPER_KEYS {
223 if let Some(Value::Array(arr)) = obj.get(*wrapper_key)
224 && !arr.is_empty()
225 && arr.iter().all(|item| matches!(item, Value::Object(_)))
226 {
227 let arr = arr.clone();
229 let mut ir = parse_array(arr)?;
230 ir.header.root_key = Some((*wrapper_key).to_string());
231 ir.header.set_flag(FLAG_HAS_ROOT_KEY);
232 return Ok(ir);
233 }
234 }
235
236 let flattened = flatten_object(&obj, "");
238 let mut field_names = Vec::new();
240 collect_field_names_ordered(&obj, "", &mut field_names);
241
242 let mut fields = Vec::new();
243 let mut has_nulls = false;
244
245 for field_name in &field_names {
246 let value = &flattened[field_name];
247 let field_type = infer_type(value);
248 if value.is_null() {
249 has_nulls = true;
250 }
251 fields.push(FieldDef::new(field_name.clone(), field_type));
252 }
253
254 let mut values = Vec::new();
256 let total_values = fields.len();
257 let bitmap_bytes = total_values.div_ceil(8);
258 let mut null_bitmap = vec![0u8; bitmap_bytes];
259
260 for (field_idx, field) in fields.iter().enumerate() {
261 let json_value = &flattened[&field.name];
262 if json_value.is_null() {
263 values.push(SchemaValue::Null);
264 set_null_bit(&mut null_bitmap, field_idx);
265 } else {
266 values.push(json_to_schema_value(json_value, &field.field_type)?);
267 }
268 }
269
270 let mut header = SchemaHeader::new(1, fields);
272 if has_nulls {
273 header.null_bitmap = Some(null_bitmap);
274 header.set_flag(FLAG_HAS_NULLS);
275 }
276
277 IntermediateRepresentation::new(header, values)
278}
279
280fn collect_field_names_ordered(obj: &Map<String, Value>, prefix: &str, names: &mut Vec<String>) {
282 for (key, value) in obj {
283 let full_key = if prefix.is_empty() {
284 key.clone()
285 } else {
286 format!("{}{}{}", prefix, NEST_SEP, key)
287 };
288
289 match value {
290 Value::Object(nested) => {
291 collect_field_names_ordered(nested, &full_key, names);
292 }
293 _ => {
294 names.push(full_key);
295 }
296 }
297 }
298}
299
300fn flatten_object(obj: &Map<String, Value>, prefix: &str) -> HashMap<String, Value> {
302 let mut result = HashMap::new();
303
304 for (key, value) in obj {
305 let full_key = if prefix.is_empty() {
306 key.clone()
307 } else {
308 format!("{}{}{}", prefix, NEST_SEP, key)
309 };
310
311 match value {
312 Value::Object(nested) => {
313 result.extend(flatten_object(nested, &full_key));
314 }
315 _ => {
316 result.insert(full_key, value.clone());
317 }
318 }
319 }
320
321 result
322}
323
324fn infer_type(value: &Value) -> FieldType {
326 match value {
327 Value::Null => FieldType::Null,
328 Value::Bool(_) => FieldType::Bool,
329 Value::Number(n) => {
330 if n.is_f64() {
331 if let Some(f) = n.as_f64()
333 && (f.fract() != 0.0 || f.is_infinite() || f.is_nan())
334 {
335 return FieldType::F64;
336 }
337 }
338
339 if let Some(i) = n.as_i64() {
340 if i < 0 {
341 FieldType::I64
342 } else {
343 FieldType::U64
344 }
345 } else if n.as_u64().is_some() {
346 FieldType::U64
347 } else {
348 FieldType::F64
349 }
350 }
351 Value::String(_) => FieldType::String,
352 Value::Array(arr) => {
353 if arr.is_empty() {
354 FieldType::Array(Box::new(FieldType::Null))
355 } else {
356 let element_type = arr
358 .iter()
359 .find(|v| !v.is_null())
360 .map(infer_type)
361 .unwrap_or(FieldType::Null);
362 FieldType::Array(Box::new(element_type))
363 }
364 }
365 Value::Object(_) => {
366 FieldType::String
368 }
369 }
370}
371
372fn infer_field_type(
374 rows: &[HashMap<String, Value>],
375 field_name: &str,
376 has_nulls: &mut bool,
377) -> Result<FieldType, SchemaError> {
378 let mut inferred_type: Option<FieldType> = None;
379
380 for row in rows {
381 if let Some(value) = row.get(field_name) {
382 if value.is_null() {
383 *has_nulls = true;
384 continue;
385 }
386
387 let current_type = infer_type(value);
388
389 if let Some(ref existing_type) = inferred_type {
390 if let (FieldType::Array(existing_inner), FieldType::Array(current_inner)) =
392 (existing_type, ¤t_type)
393 {
394 if **existing_inner == FieldType::Null && **current_inner != FieldType::Null {
395 inferred_type = Some(current_type.clone());
397 continue;
398 } else if **current_inner == FieldType::Null
399 && **existing_inner != FieldType::Null
400 {
401 continue;
403 }
404 }
405
406 if *existing_type != current_type {
407 return Ok(FieldType::Any);
409 }
410 } else {
411 inferred_type = Some(current_type);
412 }
413 } else {
414 *has_nulls = true;
415 }
416 }
417
418 Ok(inferred_type.unwrap_or(FieldType::Null))
419}
420
421fn json_to_schema_value(
423 value: &Value,
424 expected_type: &FieldType,
425) -> Result<SchemaValue, SchemaError> {
426 match value {
427 Value::Null => Ok(SchemaValue::Null),
428 Value::Bool(b) => Ok(SchemaValue::Bool(*b)),
429 Value::Number(n) => match expected_type {
430 FieldType::U64 | FieldType::Any => {
431 if let Some(u) = n.as_u64() {
432 Ok(SchemaValue::U64(u))
433 } else if let Some(i) = n.as_i64() {
434 Ok(SchemaValue::I64(i))
435 } else {
436 Ok(SchemaValue::F64(n.as_f64().unwrap()))
437 }
438 }
439 FieldType::I64 => {
440 if let Some(i) = n.as_i64() {
441 Ok(SchemaValue::I64(i))
442 } else {
443 Ok(SchemaValue::I64(n.as_f64().unwrap() as i64))
444 }
445 }
446 FieldType::F64 => Ok(SchemaValue::F64(n.as_f64().unwrap())),
447 _ => Err(SchemaError::InvalidInput(format!(
448 "Type mismatch: expected {}, but found number.\n\
449 The field type was inferred or specified as {}, which doesn't accept numeric values.",
450 expected_type.display_name(),
451 expected_type.display_name()
452 ))),
453 },
454 Value::String(s) => Ok(SchemaValue::String(s.clone())),
455 Value::Array(arr) => {
456 let element_type = if let FieldType::Array(et) = expected_type {
457 et.as_ref()
458 } else {
459 return Err(SchemaError::InvalidInput(format!(
460 "Internal error: Expected array type but found {}. This is a bug in type inference.",
461 expected_type.display_name()
462 )));
463 };
464
465 let mut schema_values = Vec::new();
466 for item in arr {
467 schema_values.push(json_to_schema_value(item, element_type)?);
468 }
469 Ok(SchemaValue::Array(schema_values))
470 }
471 Value::Object(_) => Err(SchemaError::InvalidInput(
472 "Internal error: Encountered nested object that wasn't flattened. This is a bug in the JSON parser."
473 .to_string(),
474 )),
475 }
476}
477
/// Sets bit `index` of `bitmap`, counting from the least-significant bit of
/// the first byte (bit `i` lives in byte `i / 8` at position `i % 8`).
///
/// # Panics
///
/// Panics if `index / 8` is out of bounds for `bitmap`.
fn set_null_bit(bitmap: &mut [u8], index: usize) {
    let (byte_idx, bit_idx) = (index / 8, index % 8);
    let mask = 1u8 << bit_idx;
    bitmap[byte_idx] |= mask;
}
484
// Unit tests for the JSON parser: root-shape detection, flattening of nested
// objects, type inference, null tracking and the metadata pattern.
#[cfg(test)]
mod tests {
    use super::*;

    // A flat object becomes one row with one field per key.
    #[test]
    fn test_simple_object() {
        let input = r#"{"id":1,"name":"alice"}"#;
        let ir = JsonParser::parse(input).unwrap();

        assert_eq!(ir.header.row_count, 1);
        assert_eq!(ir.header.fields.len(), 2);
        assert_eq!(ir.values.len(), 2);
    }

    // A root-level array yields one row per element; values are rows x fields.
    #[test]
    fn test_array_of_objects() {
        let input = r#"[{"id":1,"name":"alice"},{"id":2,"name":"bob"}]"#;
        let ir = JsonParser::parse(input).unwrap();

        assert_eq!(ir.header.row_count, 2);
        assert_eq!(ir.header.fields.len(), 2);
        assert_eq!(ir.values.len(), 4);
    }

    // Nested objects collapse into a single field named by the joined key path.
    #[test]
    fn test_nested_object() {
        let input = r#"{"user":{"profile":{"name":"alice"}}}"#;
        let ir = JsonParser::parse(input).unwrap();

        assert_eq!(ir.header.row_count, 1);
        assert_eq!(ir.header.fields.len(), 1);
        assert_eq!(ir.header.fields[0].name, "user჻profile჻name");
    }

    // An object whose only entry is an array of objects records that key as the root key.
    #[test]
    fn test_root_key() {
        let input = r#"{"users":[{"id":1}]}"#;
        let ir = JsonParser::parse(input).unwrap();

        assert_eq!(ir.header.root_key, Some("users".to_string()));
        assert!(ir.header.has_flag(FLAG_HAS_ROOT_KEY));
    }

    // One field of each scalar kind parses; the null field sets the null flag.
    #[test]
    fn test_all_types() {
        let input = r#"{"u":1,"i":-1,"f":3.14,"s":"test","b":true,"n":null}"#;
        let ir = JsonParser::parse(input).unwrap();

        assert_eq!(ir.header.fields.len(), 6);
        assert!(ir.header.has_flag(FLAG_HAS_NULLS));
    }

    // A null value is reported as null at its (row, field) position.
    #[test]
    fn test_null_handling() {
        let input = r#"{"name":"alice","age":null}"#;
        let ir = JsonParser::parse(input).unwrap();

        assert!(ir.header.has_flag(FLAG_HAS_NULLS));

        // Look the field up by name rather than assuming its position.
        let age_idx = ir
            .header
            .fields
            .iter()
            .position(|f| f.name == "age")
            .unwrap();
        assert!(ir.is_null(0, age_idx));
    }

    // An array of non-negative integers is typed as Array(U64).
    #[test]
    fn test_homogeneous_array() {
        let input = r#"{"scores":[1,2,3]}"#;
        let ir = JsonParser::parse(input).unwrap();

        assert_eq!(
            ir.header.fields[0].field_type,
            FieldType::Array(Box::new(FieldType::U64))
        );
    }

    // An empty array is kept as a field typed Array(Null); note the key
    // "items" is a wrapper key, but an empty array does not trigger that path.
    #[test]
    fn test_empty_array() {
        let input = r#"{"items":[]}"#;
        let ir = JsonParser::parse(input).unwrap();

        assert_eq!(
            ir.header.fields[0].field_type,
            FieldType::Array(Box::new(FieldType::Null))
        );
    }

    // Four levels of nesting still flatten to one joined field name.
    #[test]
    fn test_deep_nesting() {
        let input = r#"{"a":{"b":{"c":{"d":1}}}}"#;
        let ir = JsonParser::parse(input).unwrap();

        assert_eq!(ir.header.fields[0].name, "a჻b჻c჻d");
    }

    // Direct check of the flattening helper on a one-level nested object.
    #[test]
    fn test_flatten_object() {
        let obj: Map<String, Value> = serde_json::from_str(r#"{"a":{"b":1}}"#).unwrap();
        let flattened = flatten_object(&obj, "");

        assert_eq!(flattened.len(), 1);
        assert!(flattened.contains_key("a჻b"));
    }

    // Top-level scalars and the leaves of one nested object all become fields.
    #[test]
    fn test_single_level_nesting() {
        let input = r#"{"id":"A1","name":"Jim","grade":{"math":60,"physics":66,"chemistry":61}}"#;
        let ir = JsonParser::parse(input).unwrap();

        assert_eq!(ir.header.row_count, 1);
        assert_eq!(ir.header.fields.len(), 5);

        // Membership checks only: field order is not asserted here.
        let field_names: Vec<String> = ir.header.fields.iter().map(|f| f.name.clone()).collect();
        assert!(field_names.contains(&"id".to_string()));
        assert!(field_names.contains(&"name".to_string()));
        assert!(field_names.contains(&"grade჻math".to_string()));
        assert!(field_names.contains(&"grade჻physics".to_string()));
        assert!(field_names.contains(&"grade჻chemistry".to_string()));
    }

    // Rows under a root key are flattened too.
    #[test]
    fn test_array_of_nested_objects() {
        let input = r#"{"students":[{"id":"A1","name":"Jim","grade":{"math":60,"physics":66}}]}"#;
        let ir = JsonParser::parse(input).unwrap();

        assert_eq!(ir.header.row_count, 1);
        assert_eq!(ir.header.root_key, Some("students".to_string()));

        let field_names: Vec<String> = ir.header.fields.iter().map(|f| f.name.clone()).collect();
        assert!(field_names.contains(&"id".to_string()));
        assert!(field_names.contains(&"name".to_string()));
        assert!(field_names.contains(&"grade჻math".to_string()));
        assert!(field_names.contains(&"grade჻physics".to_string()));
    }

    // "data" holding an object (not an array of objects) is flattened like any other key.
    #[test]
    fn test_multiple_nested_levels() {
        let input = r#"{"data":{"user":{"profile":{"address":{"city":"Boston"}}}}}"#;
        let ir = JsonParser::parse(input).unwrap();

        assert_eq!(ir.header.fields.len(), 1);
        assert_eq!(ir.header.fields[0].name, "data჻user჻profile჻address჻city");
    }

    // Arrays inside nested objects stay array-typed leaf fields; sibling objects are flattened.
    #[test]
    fn test_mixed_arrays_and_objects() {
        let input =
            r#"{"person":{"name":"Alice","tags":["admin","user"],"address":{"city":"NYC"}}}"#;
        let ir = JsonParser::parse(input).unwrap();

        let field_names: Vec<String> = ir.header.fields.iter().map(|f| f.name.clone()).collect();
        assert!(field_names.contains(&"person჻name".to_string()));
        assert!(field_names.contains(&"person჻tags".to_string()));
        assert!(field_names.contains(&"person჻address჻city".to_string()));

        let tags_field = ir
            .header
            .fields
            .iter()
            .find(|f| f.name == "person჻tags")
            .unwrap();
        assert!(matches!(tags_field.field_type, FieldType::Array(_)));
    }

    // Scalar siblings of a single array of objects are captured as header metadata.
    #[test]
    fn test_metadata_pattern() {
        let input = r#"{"school_name": "Springfield High", "class": "Year 1", "students": [{"id": "A1"}, {"id": "B2"}]}"#;
        let ir = JsonParser::parse(input).unwrap();

        assert!(ir.header.metadata.is_some());
        let metadata = ir.header.metadata.as_ref().unwrap();
        assert_eq!(
            metadata.get("school_name"),
            Some(&"Springfield High".to_string())
        );
        assert_eq!(metadata.get("class"), Some(&"Year 1".to_string()));

        // The array itself supplies the rows and the root key.
        assert_eq!(ir.header.root_key, Some("students".to_string()));
        assert_eq!(ir.header.row_count, 2);
        assert_eq!(ir.header.fields.len(), 1);
        assert_eq!(ir.header.fields[0].name, "id");
    }

    // Metadata values are stringified; a null metadata value is stored as "∅".
    #[test]
    fn test_metadata_with_null() {
        let input = r#"{"note": null, "total": 2, "users": [{"id": 1}, {"id": 2}]}"#;
        let ir = JsonParser::parse(input).unwrap();

        assert!(ir.header.metadata.is_some());
        let metadata = ir.header.metadata.as_ref().unwrap();
        assert_eq!(metadata.get("note"), Some(&"∅".to_string()));
        assert_eq!(metadata.get("total"), Some(&"2".to_string()));

        assert_eq!(ir.header.root_key, Some("users".to_string()));
        assert_eq!(ir.header.row_count, 2);
        assert_eq!(ir.header.fields.len(), 1);
        assert_eq!(ir.header.fields[0].name, "id");
    }
}