base_d/encoders/algorithms/schema/parsers/
json.rs

1use crate::encoders::algorithms::schema::fiche::NEST_SEP;
2use crate::encoders::algorithms::schema::parsers::InputParser;
3use crate::encoders::algorithms::schema::types::*;
4use serde_json::{Map, Value};
5use std::collections::HashMap;
6
7pub struct JsonParser;
8
9impl InputParser for JsonParser {
10    type Error = SchemaError;
11
12    fn parse(input: &str) -> Result<IntermediateRepresentation, Self::Error> {
13        let parsed: Value = serde_json::from_str(input).map_err(|e| {
14            SchemaError::InvalidInput(format!(
15                "Invalid JSON syntax: {}\n\
16                 Ensure the input is valid JSON.",
17                e
18            ))
19        })?;
20
21        match parsed {
22            Value::Array(arr) => parse_array(arr),
23            Value::Object(obj) => parse_object(obj),
24            _ => Err(SchemaError::InvalidInput(
25                "Expected JSON object or array at root level.\n\
26                 Schema encoding works with:\n\
27                 - Single object: {\"name\": \"value\"}\n\
28                 - Array of objects: [{\"id\": 1}, {\"id\": 2}]\n\
29                 - Object with array: {\"users\": [{\"id\": 1}]}"
30                    .to_string(),
31            )),
32        }
33    }
34}
35
36/// Parse array of objects (tabular data)
37fn parse_array(arr: Vec<Value>) -> Result<IntermediateRepresentation, SchemaError> {
38    if arr.is_empty() {
39        return Err(SchemaError::InvalidInput(
40            "Empty array - cannot infer schema from zero rows.\n\
41             Provide at least one object in the array."
42                .to_string(),
43        ));
44    }
45
46    let row_count = arr.len();
47    let mut all_rows: Vec<Map<String, Value>> = Vec::new();
48
49    // Extract objects from array
50    for (idx, item) in arr.into_iter().enumerate() {
51        match item {
52            Value::Object(obj) => all_rows.push(obj),
53            other => {
54                let type_name = match other {
55                    Value::Null => "null",
56                    Value::Bool(_) => "boolean",
57                    Value::Number(_) => "number",
58                    Value::String(_) => "string",
59                    Value::Array(_) => "array",
60                    Value::Object(_) => unreachable!(),
61                };
62                return Err(SchemaError::InvalidInput(format!(
63                    "Array must contain only objects (tabular data). Found {} at index {}.\n\
64                     Schema encoding expects arrays of objects like: [{{\"id\": 1}}, {{\"id\": 2}}]",
65                    type_name, idx
66                )));
67            }
68        }
69    }
70
71    // Flatten all objects and collect field names
72    let mut flattened_rows: Vec<HashMap<String, Value>> = Vec::new();
73    let mut all_field_names = std::collections::BTreeSet::new();
74
75    for obj in &all_rows {
76        let flattened = flatten_object(obj, "");
77        for key in flattened.keys() {
78            all_field_names.insert(key.clone());
79        }
80        flattened_rows.push(flattened);
81    }
82
83    let field_names: Vec<String> = all_field_names.into_iter().collect();
84
85    // Infer types and build fields
86    let mut fields = Vec::new();
87    let mut has_nulls = false;
88
89    for field_name in &field_names {
90        let field_type = infer_field_type(&flattened_rows, field_name, &mut has_nulls)?;
91        fields.push(FieldDef::new(field_name.clone(), field_type));
92    }
93
94    // Build values and null bitmap
95    let mut values = Vec::new();
96    let total_values = row_count * fields.len();
97    let bitmap_bytes = total_values.div_ceil(8);
98    let mut null_bitmap = vec![0u8; bitmap_bytes];
99
100    for (row_idx, row) in flattened_rows.iter().enumerate() {
101        for (field_idx, field) in fields.iter().enumerate() {
102            let value_idx = row_idx * fields.len() + field_idx;
103
104            if let Some(json_value) = row.get(&field.name)
105                && json_value.is_null()
106            {
107                values.push(SchemaValue::Null);
108                set_null_bit(&mut null_bitmap, value_idx);
109                has_nulls = true;
110            } else if let Some(json_value) = row.get(&field.name) {
111                values.push(json_to_schema_value(json_value, &field.field_type)?);
112            } else {
113                // Missing field = null
114                values.push(SchemaValue::Null);
115                set_null_bit(&mut null_bitmap, value_idx);
116                has_nulls = true;
117            }
118        }
119    }
120
121    // Build header
122    let mut header = SchemaHeader::new(row_count, fields);
123    if has_nulls {
124        header.null_bitmap = Some(null_bitmap);
125        header.set_flag(FLAG_HAS_NULLS);
126    }
127
128    IntermediateRepresentation::new(header, values)
129}
130
131/// Parse single object (may have root key)
132fn parse_object(obj: Map<String, Value>) -> Result<IntermediateRepresentation, SchemaError> {
133    // Check for common pagination wrapper keys
134    const WRAPPER_KEYS: &[&str] = &["results", "data", "items", "records"];
135
136    // Check if this is a wrapper object with one of the known keys
137    if obj.len() == 1 {
138        // Check if value is an array of objects before consuming
139        let is_root_key_pattern = obj
140            .values()
141            .next()
142            .map(|v| {
143                if let Value::Array(arr) = v {
144                    // Only treat as root key if array contains objects (tabular data)
145                    !arr.is_empty() && arr.iter().all(|item| matches!(item, Value::Object(_)))
146                } else {
147                    false
148                }
149            })
150            .unwrap_or(false);
151
152        if is_root_key_pattern {
153            // Extract key and value by consuming the map
154            let (key, value) = obj.into_iter().next().unwrap();
155            // We already checked it's an array
156            let arr = match value {
157                Value::Array(a) => a,
158                _ => unreachable!(),
159            };
160
161            // Parse as array with root key
162            let mut ir = parse_array(arr)?;
163            ir.header.root_key = Some(key);
164            ir.header.set_flag(FLAG_HAS_ROOT_KEY);
165            return Ok(ir);
166        }
167    }
168
169    // Check for known wrapper patterns and unwrap them
170    for wrapper_key in WRAPPER_KEYS {
171        if let Some(Value::Array(arr)) = obj.get(*wrapper_key)
172            && !arr.is_empty()
173            && arr.iter().all(|item| matches!(item, Value::Object(_)))
174        {
175            // Found a wrapper key - unwrap and parse the array
176            let arr = arr.clone();
177            let mut ir = parse_array(arr)?;
178            ir.header.root_key = Some((*wrapper_key).to_string());
179            ir.header.set_flag(FLAG_HAS_ROOT_KEY);
180            return Ok(ir);
181        }
182    }
183
184    // Single object - treat as single row
185    let flattened = flatten_object(&obj, "");
186    // Preserve field order from original object (serde_json preserves insertion order)
187    let mut field_names = Vec::new();
188    collect_field_names_ordered(&obj, "", &mut field_names);
189
190    let mut fields = Vec::new();
191    let mut has_nulls = false;
192
193    for field_name in &field_names {
194        let value = &flattened[field_name];
195        let field_type = infer_type(value);
196        if value.is_null() {
197            has_nulls = true;
198        }
199        fields.push(FieldDef::new(field_name.clone(), field_type));
200    }
201
202    // Build values and null bitmap
203    let mut values = Vec::new();
204    let total_values = fields.len();
205    let bitmap_bytes = total_values.div_ceil(8);
206    let mut null_bitmap = vec![0u8; bitmap_bytes];
207
208    for (field_idx, field) in fields.iter().enumerate() {
209        let json_value = &flattened[&field.name];
210        if json_value.is_null() {
211            values.push(SchemaValue::Null);
212            set_null_bit(&mut null_bitmap, field_idx);
213        } else {
214            values.push(json_to_schema_value(json_value, &field.field_type)?);
215        }
216    }
217
218    // Build header
219    let mut header = SchemaHeader::new(1, fields);
220    if has_nulls {
221        header.null_bitmap = Some(null_bitmap);
222        header.set_flag(FLAG_HAS_NULLS);
223    }
224
225    IntermediateRepresentation::new(header, values)
226}
227
228/// Collect field names in order from nested object
229fn collect_field_names_ordered(obj: &Map<String, Value>, prefix: &str, names: &mut Vec<String>) {
230    for (key, value) in obj {
231        let full_key = if prefix.is_empty() {
232            key.clone()
233        } else {
234            format!("{}{}{}", prefix, NEST_SEP, key)
235        };
236
237        match value {
238            Value::Object(nested) => {
239                collect_field_names_ordered(nested, &full_key, names);
240            }
241            _ => {
242                names.push(full_key);
243            }
244        }
245    }
246}
247
248/// Flatten nested object with NEST_SEP delimiter
249fn flatten_object(obj: &Map<String, Value>, prefix: &str) -> HashMap<String, Value> {
250    let mut result = HashMap::new();
251
252    for (key, value) in obj {
253        let full_key = if prefix.is_empty() {
254            key.clone()
255        } else {
256            format!("{}{}{}", prefix, NEST_SEP, key)
257        };
258
259        match value {
260            Value::Object(nested) => {
261                result.extend(flatten_object(nested, &full_key));
262            }
263            _ => {
264                result.insert(full_key, value.clone());
265            }
266        }
267    }
268
269    result
270}
271
272/// Infer type from a single JSON value
273fn infer_type(value: &Value) -> FieldType {
274    match value {
275        Value::Null => FieldType::Null,
276        Value::Bool(_) => FieldType::Bool,
277        Value::Number(n) => {
278            if n.is_f64() {
279                // Check if it has a fractional part
280                if let Some(f) = n.as_f64()
281                    && (f.fract() != 0.0 || f.is_infinite() || f.is_nan())
282                {
283                    return FieldType::F64;
284                }
285            }
286
287            if let Some(i) = n.as_i64() {
288                if i < 0 {
289                    FieldType::I64
290                } else {
291                    FieldType::U64
292                }
293            } else if n.as_u64().is_some() {
294                FieldType::U64
295            } else {
296                FieldType::F64
297            }
298        }
299        Value::String(_) => FieldType::String,
300        Value::Array(arr) => {
301            if arr.is_empty() {
302                FieldType::Array(Box::new(FieldType::Null))
303            } else {
304                // Infer from first non-null element
305                let element_type = arr
306                    .iter()
307                    .find(|v| !v.is_null())
308                    .map(infer_type)
309                    .unwrap_or(FieldType::Null);
310                FieldType::Array(Box::new(element_type))
311            }
312        }
313        Value::Object(_) => {
314            // This shouldn't happen after flattening
315            FieldType::String
316        }
317    }
318}
319
320/// Infer field type across multiple rows
321fn infer_field_type(
322    rows: &[HashMap<String, Value>],
323    field_name: &str,
324    has_nulls: &mut bool,
325) -> Result<FieldType, SchemaError> {
326    let mut inferred_type: Option<FieldType> = None;
327
328    for row in rows {
329        if let Some(value) = row.get(field_name) {
330            if value.is_null() {
331                *has_nulls = true;
332                continue;
333            }
334
335            let current_type = infer_type(value);
336
337            if let Some(ref existing_type) = inferred_type {
338                // Special case: Array(Null) unifies with Array(T) → Array(T)
339                if let (FieldType::Array(existing_inner), FieldType::Array(current_inner)) =
340                    (existing_type, &current_type)
341                {
342                    if **existing_inner == FieldType::Null && **current_inner != FieldType::Null {
343                        // Upgrade from Array(Null) to Array(T)
344                        inferred_type = Some(current_type.clone());
345                        continue;
346                    } else if **current_inner == FieldType::Null
347                        && **existing_inner != FieldType::Null
348                    {
349                        // Keep existing Array(T), ignore Array(Null)
350                        continue;
351                    }
352                }
353
354                if *existing_type != current_type {
355                    // Type conflict - use Any
356                    return Ok(FieldType::Any);
357                }
358            } else {
359                inferred_type = Some(current_type);
360            }
361        } else {
362            *has_nulls = true;
363        }
364    }
365
366    Ok(inferred_type.unwrap_or(FieldType::Null))
367}
368
369/// Convert JSON value to SchemaValue
370fn json_to_schema_value(
371    value: &Value,
372    expected_type: &FieldType,
373) -> Result<SchemaValue, SchemaError> {
374    match value {
375        Value::Null => Ok(SchemaValue::Null),
376        Value::Bool(b) => Ok(SchemaValue::Bool(*b)),
377        Value::Number(n) => match expected_type {
378            FieldType::U64 | FieldType::Any => {
379                if let Some(u) = n.as_u64() {
380                    Ok(SchemaValue::U64(u))
381                } else if let Some(i) = n.as_i64() {
382                    Ok(SchemaValue::I64(i))
383                } else {
384                    Ok(SchemaValue::F64(n.as_f64().unwrap()))
385                }
386            }
387            FieldType::I64 => {
388                if let Some(i) = n.as_i64() {
389                    Ok(SchemaValue::I64(i))
390                } else {
391                    Ok(SchemaValue::I64(n.as_f64().unwrap() as i64))
392                }
393            }
394            FieldType::F64 => Ok(SchemaValue::F64(n.as_f64().unwrap())),
395            _ => Err(SchemaError::InvalidInput(format!(
396                "Type mismatch: expected {}, but found number.\n\
397                 The field type was inferred or specified as {}, which doesn't accept numeric values.",
398                expected_type.display_name(),
399                expected_type.display_name()
400            ))),
401        },
402        Value::String(s) => Ok(SchemaValue::String(s.clone())),
403        Value::Array(arr) => {
404            let element_type = if let FieldType::Array(et) = expected_type {
405                et.as_ref()
406            } else {
407                return Err(SchemaError::InvalidInput(format!(
408                    "Internal error: Expected array type but found {}. This is a bug in type inference.",
409                    expected_type.display_name()
410                )));
411            };
412
413            let mut schema_values = Vec::new();
414            for item in arr {
415                schema_values.push(json_to_schema_value(item, element_type)?);
416            }
417            Ok(SchemaValue::Array(schema_values))
418        }
419        Value::Object(_) => Err(SchemaError::InvalidInput(
420            "Internal error: Encountered nested object that wasn't flattened. This is a bug in the JSON parser."
421                .to_string(),
422        )),
423    }
424}
425
/// Mark entry `index` as null by setting its bit in `bitmap`.
///
/// Bits are packed least-significant-bit first: entry `index` lives in byte `index / 8`, at
/// bit position `index % 8`. Setting a bit that is already set changes nothing.
///
/// # Panics
///
/// Panics if `index / 8` is out of bounds for `bitmap`.
fn set_null_bit(bitmap: &mut [u8], index: usize) {
    let mask = 1u8 << (index % 8);
    bitmap[index / 8] |= mask;
}
432
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `input`, panicking if the parser rejects it.
    #[track_caller]
    fn parse_ok(input: &str) -> IntermediateRepresentation {
        JsonParser::parse(input).unwrap()
    }

    /// Header field names, in header order.
    fn field_names(ir: &IntermediateRepresentation) -> Vec<&str> {
        ir.header
            .fields
            .iter()
            .map(|field| field.name.as_str())
            .collect()
    }

    // A flat object is one row with one value per key.
    #[test]
    fn test_simple_object() {
        let ir = parse_ok(r#"{"id":1,"name":"alice"}"#);

        assert_eq!(ir.header.row_count, 1);
        assert_eq!(ir.header.fields.len(), 2);
        assert_eq!(ir.values.len(), 2);
    }

    // A root array yields one row per element.
    #[test]
    fn test_array_of_objects() {
        let ir = parse_ok(r#"[{"id":1,"name":"alice"},{"id":2,"name":"bob"}]"#);

        assert_eq!(ir.header.row_count, 2);
        assert_eq!(ir.header.fields.len(), 2);
        assert_eq!(ir.values.len(), 4);
    }

    // Nested keys are joined with the nesting separator.
    #[test]
    fn test_nested_object() {
        let ir = parse_ok(r#"{"user":{"profile":{"name":"alice"}}}"#);

        assert_eq!(ir.header.row_count, 1);
        assert_eq!(ir.header.fields.len(), 1);
        assert_eq!(ir.header.fields[0].name, "user჻profile჻name");
    }

    // A lone key wrapping an array of objects becomes the root key.
    #[test]
    fn test_root_key() {
        let ir = parse_ok(r#"{"users":[{"id":1}]}"#);

        assert_eq!(ir.header.root_key, Some("users".to_string()));
        assert!(ir.header.has_flag(FLAG_HAS_ROOT_KEY));
    }

    #[test]
    fn test_all_types() {
        let ir = parse_ok(r#"{"u":1,"i":-1,"f":3.14,"s":"test","b":true,"n":null}"#);

        assert_eq!(ir.header.fields.len(), 6);
        assert!(ir.header.has_flag(FLAG_HAS_NULLS));
    }

    #[test]
    fn test_null_handling() {
        let ir = parse_ok(r#"{"name":"alice","age":null}"#);

        assert!(ir.header.has_flag(FLAG_HAS_NULLS));

        // Locate the "age" column, then check that its cell in row 0 is null.
        let age_idx = field_names(&ir)
            .iter()
            .position(|name| *name == "age")
            .unwrap();
        assert!(ir.is_null(0, age_idx));
    }

    #[test]
    fn test_homogeneous_array() {
        let ir = parse_ok(r#"{"scores":[1,2,3]}"#);

        let expected = FieldType::Array(Box::new(FieldType::U64));
        assert_eq!(ir.header.fields[0].field_type, expected);
    }

    // An empty array has no element to infer from, so its element type is Null.
    #[test]
    fn test_empty_array() {
        let ir = parse_ok(r#"{"items":[]}"#);

        let expected = FieldType::Array(Box::new(FieldType::Null));
        assert_eq!(ir.header.fields[0].field_type, expected);
    }

    #[test]
    fn test_deep_nesting() {
        let ir = parse_ok(r#"{"a":{"b":{"c":{"d":1}}}}"#);

        assert_eq!(ir.header.fields[0].name, "a჻b჻c჻d");
    }

    #[test]
    fn test_flatten_object() {
        let obj: Map<String, Value> = serde_json::from_str(r#"{"a":{"b":1}}"#).unwrap();
        let flattened = flatten_object(&obj, "");

        assert_eq!(flattened.len(), 1);
        assert!(flattened.contains_key("a჻b"));
    }

    #[test]
    fn test_single_level_nesting() {
        let ir = parse_ok(
            r#"{"id":"A1","name":"Jim","grade":{"math":60,"physics":66,"chemistry":61}}"#,
        );

        assert_eq!(ir.header.row_count, 1);
        assert_eq!(ir.header.fields.len(), 5);

        // Check field names
        let names = field_names(&ir);
        for expected in ["id", "name", "grade჻math", "grade჻physics", "grade჻chemistry"] {
            assert!(names.contains(&expected), "missing field {expected}");
        }
    }

    #[test]
    fn test_array_of_nested_objects() {
        let ir = parse_ok(
            r#"{"students":[{"id":"A1","name":"Jim","grade":{"math":60,"physics":66}}]}"#,
        );

        assert_eq!(ir.header.row_count, 1);
        assert_eq!(ir.header.root_key, Some("students".to_string()));

        let names = field_names(&ir);
        for expected in ["id", "name", "grade჻math", "grade჻physics"] {
            assert!(names.contains(&expected), "missing field {expected}");
        }
    }

    // "data" holds an object here, not an array, so it is flattened rather than unwrapped.
    #[test]
    fn test_multiple_nested_levels() {
        let ir = parse_ok(r#"{"data":{"user":{"profile":{"address":{"city":"Boston"}}}}}"#);

        assert_eq!(ir.header.fields.len(), 1);
        assert_eq!(ir.header.fields[0].name, "data჻user჻profile჻address჻city");
    }

    #[test]
    fn test_mixed_arrays_and_objects() {
        let ir = parse_ok(
            r#"{"person":{"name":"Alice","tags":["admin","user"],"address":{"city":"NYC"}}}"#,
        );

        let names = field_names(&ir);
        for expected in ["person჻name", "person჻tags", "person჻address჻city"] {
            assert!(names.contains(&expected), "missing field {expected}");
        }

        // Verify tags is an array type
        let tags_field = ir
            .header
            .fields
            .iter()
            .find(|field| field.name == "person჻tags")
            .unwrap();
        assert!(matches!(tags_field.field_type, FieldType::Array(_)));
    }
}