base_d/encoders/algorithms/schema/parsers/
json.rs

1use crate::encoders::algorithms::schema::fiche::NEST_SEP;
2use crate::encoders::algorithms::schema::parsers::InputParser;
3use crate::encoders::algorithms::schema::types::*;
4use serde_json::{Map, Value};
5use std::collections::HashMap;
6
7pub struct JsonParser;
8
9impl InputParser for JsonParser {
10    type Error = SchemaError;
11
12    fn parse(input: &str) -> Result<IntermediateRepresentation, Self::Error> {
13        let parsed: Value = serde_json::from_str(input).map_err(|e| {
14            SchemaError::InvalidInput(format!(
15                "Invalid JSON syntax: {}\n\
16                 Ensure the input is valid JSON.",
17                e
18            ))
19        })?;
20
21        match parsed {
22            Value::Array(arr) => parse_array(arr),
23            Value::Object(obj) => parse_object(obj),
24            _ => Err(SchemaError::InvalidInput(
25                "Expected JSON object or array at root level.\n\
26                 Schema encoding works with:\n\
27                 - Single object: {\"name\": \"value\"}\n\
28                 - Array of objects: [{\"id\": 1}, {\"id\": 2}]\n\
29                 - Object with array: {\"users\": [{\"id\": 1}]}"
30                    .to_string(),
31            )),
32        }
33    }
34}
35
36/// Parse array of objects (tabular data)
37fn parse_array(arr: Vec<Value>) -> Result<IntermediateRepresentation, SchemaError> {
38    if arr.is_empty() {
39        return Err(SchemaError::InvalidInput(
40            "Empty array - cannot infer schema from zero rows.\n\
41             Provide at least one object in the array."
42                .to_string(),
43        ));
44    }
45
46    let row_count = arr.len();
47    let mut all_rows: Vec<Map<String, Value>> = Vec::new();
48
49    // Extract objects from array
50    for (idx, item) in arr.into_iter().enumerate() {
51        match item {
52            Value::Object(obj) => all_rows.push(obj),
53            other => {
54                let type_name = match other {
55                    Value::Null => "null",
56                    Value::Bool(_) => "boolean",
57                    Value::Number(_) => "number",
58                    Value::String(_) => "string",
59                    Value::Array(_) => "array",
60                    Value::Object(_) => unreachable!(),
61                };
62                return Err(SchemaError::InvalidInput(format!(
63                    "Array must contain only objects (tabular data). Found {} at index {}.\n\
64                     Schema encoding expects arrays of objects like: [{{\"id\": 1}}, {{\"id\": 2}}]",
65                    type_name, idx
66                )));
67            }
68        }
69    }
70
71    // Flatten all objects and collect field names
72    let mut flattened_rows: Vec<HashMap<String, Value>> = Vec::new();
73    let mut all_field_names = std::collections::BTreeSet::new();
74
75    for obj in &all_rows {
76        let flattened = flatten_object(obj, "");
77        for key in flattened.keys() {
78            all_field_names.insert(key.clone());
79        }
80        flattened_rows.push(flattened);
81    }
82
83    let field_names: Vec<String> = all_field_names.into_iter().collect();
84
85    // Infer types and build fields
86    let mut fields = Vec::new();
87    let mut has_nulls = false;
88
89    for field_name in &field_names {
90        let field_type = infer_field_type(&flattened_rows, field_name, &mut has_nulls)?;
91        fields.push(FieldDef::new(field_name.clone(), field_type));
92    }
93
94    // Build values and null bitmap
95    let mut values = Vec::new();
96    let total_values = row_count * fields.len();
97    let bitmap_bytes = total_values.div_ceil(8);
98    let mut null_bitmap = vec![0u8; bitmap_bytes];
99
100    for (row_idx, row) in flattened_rows.iter().enumerate() {
101        for (field_idx, field) in fields.iter().enumerate() {
102            let value_idx = row_idx * fields.len() + field_idx;
103
104            if let Some(json_value) = row.get(&field.name)
105                && json_value.is_null()
106            {
107                values.push(SchemaValue::Null);
108                set_null_bit(&mut null_bitmap, value_idx);
109                has_nulls = true;
110            } else if let Some(json_value) = row.get(&field.name) {
111                values.push(json_to_schema_value(json_value, &field.field_type)?);
112            } else {
113                // Missing field = null
114                values.push(SchemaValue::Null);
115                set_null_bit(&mut null_bitmap, value_idx);
116                has_nulls = true;
117            }
118        }
119    }
120
121    // Build header
122    let mut header = SchemaHeader::new(row_count, fields);
123    if has_nulls {
124        header.null_bitmap = Some(null_bitmap);
125        header.set_flag(FLAG_HAS_NULLS);
126    }
127
128    IntermediateRepresentation::new(header, values)
129}
130
131/// Parse single object (may have root key)
132fn parse_object(obj: Map<String, Value>) -> Result<IntermediateRepresentation, SchemaError> {
133    // Check for common pagination wrapper keys
134    const WRAPPER_KEYS: &[&str] = &["results", "data", "items", "records"];
135
136    // Check for metadata pattern: scalar fields + one array field
137    let mut array_field: Option<(String, Vec<Value>)> = None;
138    let mut scalar_fields: std::collections::HashMap<String, String> =
139        std::collections::HashMap::new();
140
141    for (key, value) in &obj {
142        match value {
143            Value::Array(arr)
144                if !arr.is_empty() && arr.iter().all(|item| matches!(item, Value::Object(_))) =>
145            {
146                if array_field.is_none() {
147                    array_field = Some((key.clone(), arr.clone()));
148                } else {
149                    // Multiple arrays - not metadata pattern
150                    array_field = None;
151                    scalar_fields.clear();
152                    break;
153                }
154            }
155            Value::String(s) => {
156                scalar_fields.insert(key.clone(), s.clone());
157            }
158            Value::Number(n) => {
159                scalar_fields.insert(key.clone(), n.to_string());
160            }
161            Value::Bool(b) => {
162                scalar_fields.insert(key.clone(), b.to_string());
163            }
164            Value::Null => {
165                // Encode null metadata as ∅ symbol
166                scalar_fields.insert(key.clone(), "∅".to_string());
167            }
168            _ => {
169                // Non-scalar or nested object - not metadata pattern
170                scalar_fields.clear();
171                array_field = None;
172                break;
173            }
174        }
175    }
176
177    // If we have exactly one array field and at least one scalar field, extract metadata
178    if let Some((array_key, arr)) = array_field
179        && !scalar_fields.is_empty()
180    {
181        let mut ir = parse_array(arr)?;
182        ir.header.root_key = Some(array_key);
183        ir.header.set_flag(FLAG_HAS_ROOT_KEY);
184        ir.header.metadata = Some(scalar_fields);
185        return Ok(ir);
186    }
187
188    // Check if this is a wrapper object with one of the known keys
189    if obj.len() == 1 {
190        // Check if value is an array of objects before consuming
191        let is_root_key_pattern = obj
192            .values()
193            .next()
194            .map(|v| {
195                if let Value::Array(arr) = v {
196                    // Only treat as root key if array contains objects (tabular data)
197                    !arr.is_empty() && arr.iter().all(|item| matches!(item, Value::Object(_)))
198                } else {
199                    false
200                }
201            })
202            .unwrap_or(false);
203
204        if is_root_key_pattern {
205            // Extract key and value by consuming the map
206            let (key, value) = obj.into_iter().next().unwrap();
207            // We already checked it's an array
208            let arr = match value {
209                Value::Array(a) => a,
210                _ => unreachable!(),
211            };
212
213            // Parse as array with root key
214            let mut ir = parse_array(arr)?;
215            ir.header.root_key = Some(key);
216            ir.header.set_flag(FLAG_HAS_ROOT_KEY);
217            return Ok(ir);
218        }
219    }
220
221    // Check for known wrapper patterns and unwrap them
222    for wrapper_key in WRAPPER_KEYS {
223        if let Some(Value::Array(arr)) = obj.get(*wrapper_key)
224            && !arr.is_empty()
225            && arr.iter().all(|item| matches!(item, Value::Object(_)))
226        {
227            // Found a wrapper key - unwrap and parse the array
228            let arr = arr.clone();
229            let mut ir = parse_array(arr)?;
230            ir.header.root_key = Some((*wrapper_key).to_string());
231            ir.header.set_flag(FLAG_HAS_ROOT_KEY);
232            return Ok(ir);
233        }
234    }
235
236    // Single object - treat as single row
237    let flattened = flatten_object(&obj, "");
238    // Preserve field order from original object (serde_json preserves insertion order)
239    let mut field_names = Vec::new();
240    collect_field_names_ordered(&obj, "", &mut field_names);
241
242    let mut fields = Vec::new();
243    let mut has_nulls = false;
244
245    for field_name in &field_names {
246        let value = &flattened[field_name];
247        let field_type = infer_type(value);
248        if value.is_null() {
249            has_nulls = true;
250        }
251        fields.push(FieldDef::new(field_name.clone(), field_type));
252    }
253
254    // Build values and null bitmap
255    let mut values = Vec::new();
256    let total_values = fields.len();
257    let bitmap_bytes = total_values.div_ceil(8);
258    let mut null_bitmap = vec![0u8; bitmap_bytes];
259
260    for (field_idx, field) in fields.iter().enumerate() {
261        let json_value = &flattened[&field.name];
262        if json_value.is_null() {
263            values.push(SchemaValue::Null);
264            set_null_bit(&mut null_bitmap, field_idx);
265        } else {
266            values.push(json_to_schema_value(json_value, &field.field_type)?);
267        }
268    }
269
270    // Build header
271    let mut header = SchemaHeader::new(1, fields);
272    if has_nulls {
273        header.null_bitmap = Some(null_bitmap);
274        header.set_flag(FLAG_HAS_NULLS);
275    }
276
277    IntermediateRepresentation::new(header, values)
278}
279
280/// Collect field names in order from nested object
281fn collect_field_names_ordered(obj: &Map<String, Value>, prefix: &str, names: &mut Vec<String>) {
282    for (key, value) in obj {
283        let full_key = if prefix.is_empty() {
284            key.clone()
285        } else {
286            format!("{}{}{}", prefix, NEST_SEP, key)
287        };
288
289        match value {
290            Value::Object(nested) => {
291                collect_field_names_ordered(nested, &full_key, names);
292            }
293            _ => {
294                names.push(full_key);
295            }
296        }
297    }
298}
299
300/// Flatten nested object with NEST_SEP delimiter
301fn flatten_object(obj: &Map<String, Value>, prefix: &str) -> HashMap<String, Value> {
302    let mut result = HashMap::new();
303
304    for (key, value) in obj {
305        let full_key = if prefix.is_empty() {
306            key.clone()
307        } else {
308            format!("{}{}{}", prefix, NEST_SEP, key)
309        };
310
311        match value {
312            Value::Object(nested) => {
313                result.extend(flatten_object(nested, &full_key));
314            }
315            _ => {
316                result.insert(full_key, value.clone());
317            }
318        }
319    }
320
321    result
322}
323
324/// Infer type from a single JSON value
325fn infer_type(value: &Value) -> FieldType {
326    match value {
327        Value::Null => FieldType::Null,
328        Value::Bool(_) => FieldType::Bool,
329        Value::Number(n) => {
330            if n.is_f64() {
331                // Check if it has a fractional part
332                if let Some(f) = n.as_f64()
333                    && (f.fract() != 0.0 || f.is_infinite() || f.is_nan())
334                {
335                    return FieldType::F64;
336                }
337            }
338
339            if let Some(i) = n.as_i64() {
340                if i < 0 {
341                    FieldType::I64
342                } else {
343                    FieldType::U64
344                }
345            } else if n.as_u64().is_some() {
346                FieldType::U64
347            } else {
348                FieldType::F64
349            }
350        }
351        Value::String(_) => FieldType::String,
352        Value::Array(arr) => {
353            if arr.is_empty() {
354                FieldType::Array(Box::new(FieldType::Null))
355            } else {
356                // Infer from first non-null element
357                let element_type = arr
358                    .iter()
359                    .find(|v| !v.is_null())
360                    .map(infer_type)
361                    .unwrap_or(FieldType::Null);
362                FieldType::Array(Box::new(element_type))
363            }
364        }
365        Value::Object(_) => {
366            // This shouldn't happen after flattening
367            FieldType::String
368        }
369    }
370}
371
372/// Infer field type across multiple rows
373fn infer_field_type(
374    rows: &[HashMap<String, Value>],
375    field_name: &str,
376    has_nulls: &mut bool,
377) -> Result<FieldType, SchemaError> {
378    let mut inferred_type: Option<FieldType> = None;
379
380    for row in rows {
381        if let Some(value) = row.get(field_name) {
382            if value.is_null() {
383                *has_nulls = true;
384                continue;
385            }
386
387            let current_type = infer_type(value);
388
389            if let Some(ref existing_type) = inferred_type {
390                // Special case: Array(Null) unifies with Array(T) → Array(T)
391                if let (FieldType::Array(existing_inner), FieldType::Array(current_inner)) =
392                    (existing_type, &current_type)
393                {
394                    if **existing_inner == FieldType::Null && **current_inner != FieldType::Null {
395                        // Upgrade from Array(Null) to Array(T)
396                        inferred_type = Some(current_type.clone());
397                        continue;
398                    } else if **current_inner == FieldType::Null
399                        && **existing_inner != FieldType::Null
400                    {
401                        // Keep existing Array(T), ignore Array(Null)
402                        continue;
403                    }
404                }
405
406                if *existing_type != current_type {
407                    // Type conflict - use Any
408                    return Ok(FieldType::Any);
409                }
410            } else {
411                inferred_type = Some(current_type);
412            }
413        } else {
414            *has_nulls = true;
415        }
416    }
417
418    Ok(inferred_type.unwrap_or(FieldType::Null))
419}
420
421/// Convert JSON value to SchemaValue
422fn json_to_schema_value(
423    value: &Value,
424    expected_type: &FieldType,
425) -> Result<SchemaValue, SchemaError> {
426    match value {
427        Value::Null => Ok(SchemaValue::Null),
428        Value::Bool(b) => Ok(SchemaValue::Bool(*b)),
429        Value::Number(n) => match expected_type {
430            FieldType::U64 | FieldType::Any => {
431                if let Some(u) = n.as_u64() {
432                    Ok(SchemaValue::U64(u))
433                } else if let Some(i) = n.as_i64() {
434                    Ok(SchemaValue::I64(i))
435                } else {
436                    Ok(SchemaValue::F64(n.as_f64().unwrap()))
437                }
438            }
439            FieldType::I64 => {
440                if let Some(i) = n.as_i64() {
441                    Ok(SchemaValue::I64(i))
442                } else {
443                    Ok(SchemaValue::I64(n.as_f64().unwrap() as i64))
444                }
445            }
446            FieldType::F64 => Ok(SchemaValue::F64(n.as_f64().unwrap())),
447            _ => Err(SchemaError::InvalidInput(format!(
448                "Type mismatch: expected {}, but found number.\n\
449                 The field type was inferred or specified as {}, which doesn't accept numeric values.",
450                expected_type.display_name(),
451                expected_type.display_name()
452            ))),
453        },
454        Value::String(s) => Ok(SchemaValue::String(s.clone())),
455        Value::Array(arr) => {
456            let element_type = if let FieldType::Array(et) = expected_type {
457                et.as_ref()
458            } else {
459                return Err(SchemaError::InvalidInput(format!(
460                    "Internal error: Expected array type but found {}. This is a bug in type inference.",
461                    expected_type.display_name()
462                )));
463            };
464
465            let mut schema_values = Vec::new();
466            for item in arr {
467                schema_values.push(json_to_schema_value(item, element_type)?);
468            }
469            Ok(SchemaValue::Array(schema_values))
470        }
471        Value::Object(_) => Err(SchemaError::InvalidInput(
472            "Internal error: Encountered nested object that wasn't flattened. This is a bug in the JSON parser."
473                .to_string(),
474        )),
475    }
476}
477
/// Marks position `index` as null in `bitmap`.
///
/// Bit `index % 8` of byte `index / 8` is set, so bits are numbered from the
/// least-significant bit of each byte. Bits that are already set stay set.
///
/// # Panics
///
/// Panics if `index / 8` is outside `bitmap`.
fn set_null_bit(bitmap: &mut [u8], index: usize) {
    let mask = 1u8 << (index % 8);
    bitmap[index / 8] |= mask;
}
484
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `input`, panicking if the parser rejects it.
    fn parse_ok(input: &str) -> IntermediateRepresentation {
        JsonParser::parse(input).unwrap()
    }

    /// Lists the field names of `ir` in header order.
    fn field_names(ir: &IntermediateRepresentation) -> Vec<&str> {
        ir.header
            .fields
            .iter()
            .map(|field| field.name.as_str())
            .collect()
    }

    #[test]
    fn test_simple_object() {
        let ir = parse_ok(r#"{"id":1,"name":"alice"}"#);

        assert_eq!(ir.header.row_count, 1);
        assert_eq!(ir.header.fields.len(), 2);
        assert_eq!(ir.values.len(), 2);
    }

    #[test]
    fn test_array_of_objects() {
        let ir = parse_ok(r#"[{"id":1,"name":"alice"},{"id":2,"name":"bob"}]"#);

        assert_eq!(ir.header.row_count, 2);
        assert_eq!(ir.header.fields.len(), 2);
        assert_eq!(ir.values.len(), 4);
    }

    #[test]
    fn test_nested_object() {
        let ir = parse_ok(r#"{"user":{"profile":{"name":"alice"}}}"#);

        assert_eq!(ir.header.row_count, 1);
        assert_eq!(ir.header.fields.len(), 1);
        assert_eq!(ir.header.fields[0].name, "user჻profile჻name");
    }

    #[test]
    fn test_root_key() {
        let ir = parse_ok(r#"{"users":[{"id":1}]}"#);

        assert_eq!(ir.header.root_key, Some("users".to_string()));
        assert!(ir.header.has_flag(FLAG_HAS_ROOT_KEY));
    }

    #[test]
    fn test_all_types() {
        let ir = parse_ok(r#"{"u":1,"i":-1,"f":3.14,"s":"test","b":true,"n":null}"#);

        assert_eq!(ir.header.fields.len(), 6);
        assert!(ir.header.has_flag(FLAG_HAS_NULLS));
    }

    #[test]
    fn test_null_handling() {
        let ir = parse_ok(r#"{"name":"alice","age":null}"#);

        assert!(ir.header.has_flag(FLAG_HAS_NULLS));

        // Locate the "age" column, then check its cell in the only row.
        let age_idx = field_names(&ir)
            .iter()
            .position(|name| *name == "age")
            .unwrap();
        assert!(ir.is_null(0, age_idx));
    }

    #[test]
    fn test_homogeneous_array() {
        let ir = parse_ok(r#"{"scores":[1,2,3]}"#);

        let expected = FieldType::Array(Box::new(FieldType::U64));
        assert_eq!(ir.header.fields[0].field_type, expected);
    }

    #[test]
    fn test_empty_array() {
        let ir = parse_ok(r#"{"items":[]}"#);

        let expected = FieldType::Array(Box::new(FieldType::Null));
        assert_eq!(ir.header.fields[0].field_type, expected);
    }

    #[test]
    fn test_deep_nesting() {
        let ir = parse_ok(r#"{"a":{"b":{"c":{"d":1}}}}"#);

        assert_eq!(ir.header.fields[0].name, "a჻b჻c჻d");
    }

    #[test]
    fn test_flatten_object() {
        let obj: Map<String, Value> = serde_json::from_str(r#"{"a":{"b":1}}"#).unwrap();
        let flattened = flatten_object(&obj, "");

        assert_eq!(flattened.len(), 1);
        assert!(flattened.contains_key("a჻b"));
    }

    #[test]
    fn test_single_level_nesting() {
        let ir = parse_ok(
            r#"{"id":"A1","name":"Jim","grade":{"math":60,"physics":66,"chemistry":61}}"#,
        );

        assert_eq!(ir.header.row_count, 1);
        assert_eq!(ir.header.fields.len(), 5);

        let names = field_names(&ir);
        for expected in ["id", "name", "grade჻math", "grade჻physics", "grade჻chemistry"] {
            assert!(names.contains(&expected));
        }
    }

    #[test]
    fn test_array_of_nested_objects() {
        let ir = parse_ok(
            r#"{"students":[{"id":"A1","name":"Jim","grade":{"math":60,"physics":66}}]}"#,
        );

        assert_eq!(ir.header.row_count, 1);
        assert_eq!(ir.header.root_key, Some("students".to_string()));

        let names = field_names(&ir);
        for expected in ["id", "name", "grade჻math", "grade჻physics"] {
            assert!(names.contains(&expected));
        }
    }

    #[test]
    fn test_multiple_nested_levels() {
        let ir = parse_ok(r#"{"data":{"user":{"profile":{"address":{"city":"Boston"}}}}}"#);

        assert_eq!(ir.header.fields.len(), 1);
        assert_eq!(ir.header.fields[0].name, "data჻user჻profile჻address჻city");
    }

    #[test]
    fn test_mixed_arrays_and_objects() {
        let ir = parse_ok(
            r#"{"person":{"name":"Alice","tags":["admin","user"],"address":{"city":"NYC"}}}"#,
        );

        let names = field_names(&ir);
        for expected in ["person჻name", "person჻tags", "person჻address჻city"] {
            assert!(names.contains(&expected));
        }

        // The string list must be typed as an array.
        let tags_field = ir
            .header
            .fields
            .iter()
            .find(|field| field.name == "person჻tags")
            .unwrap();
        assert!(matches!(tags_field.field_type, FieldType::Array(_)));
    }

    #[test]
    fn test_metadata_pattern() {
        let ir = parse_ok(
            r#"{"school_name": "Springfield High", "class": "Year 1", "students": [{"id": "A1"}, {"id": "B2"}]}"#,
        );

        // Scalar members are extracted as metadata.
        assert!(ir.header.metadata.is_some());
        let metadata = ir.header.metadata.as_ref().unwrap();
        assert_eq!(
            metadata.get("school_name").map(String::as_str),
            Some("Springfield High")
        );
        assert_eq!(metadata.get("class").map(String::as_str), Some("Year 1"));

        // The array supplies the data rows.
        assert_eq!(ir.header.root_key, Some("students".to_string()));
        assert_eq!(ir.header.row_count, 2);
        assert_eq!(ir.header.fields.len(), 1);
        assert_eq!(ir.header.fields[0].name, "id");
    }

    #[test]
    fn test_metadata_with_null() {
        let ir = parse_ok(r#"{"note": null, "total": 2, "users": [{"id": 1}, {"id": 2}]}"#);

        // Null metadata is kept, encoded as the ∅ symbol.
        assert!(ir.header.metadata.is_some());
        let metadata = ir.header.metadata.as_ref().unwrap();
        assert_eq!(metadata.get("note").map(String::as_str), Some("∅"));
        assert_eq!(metadata.get("total").map(String::as_str), Some("2"));

        // The array supplies the data rows.
        assert_eq!(ir.header.root_key, Some("users".to_string()));
        assert_eq!(ir.header.row_count, 2);
        assert_eq!(ir.header.fields.len(), 1);
        assert_eq!(ir.header.fields[0].name, "id");
    }
}