base_d/encoders/algorithms/schema/parsers/
json.rs

1use crate::encoders::algorithms::schema::parsers::InputParser;
2use crate::encoders::algorithms::schema::types::*;
3use serde_json::{Map, Value};
4use std::collections::HashMap;
5
6pub struct JsonParser;
7
8impl InputParser for JsonParser {
9    type Error = SchemaError;
10
11    fn parse(input: &str) -> Result<IntermediateRepresentation, Self::Error> {
12        let parsed: Value = serde_json::from_str(input).map_err(|e| {
13            SchemaError::InvalidInput(format!(
14                "Invalid JSON syntax: {}\n\
15                 Ensure the input is valid JSON.",
16                e
17            ))
18        })?;
19
20        match parsed {
21            Value::Array(arr) => parse_array(arr),
22            Value::Object(obj) => parse_object(obj),
23            _ => Err(SchemaError::InvalidInput(
24                "Expected JSON object or array at root level.\n\
25                 Schema encoding works with:\n\
26                 - Single object: {\"name\": \"value\"}\n\
27                 - Array of objects: [{\"id\": 1}, {\"id\": 2}]\n\
28                 - Object with array: {\"users\": [{\"id\": 1}]}"
29                    .to_string(),
30            )),
31        }
32    }
33}
34
35/// Parse array of objects (tabular data)
36fn parse_array(arr: Vec<Value>) -> Result<IntermediateRepresentation, SchemaError> {
37    if arr.is_empty() {
38        return Err(SchemaError::InvalidInput(
39            "Empty array - cannot infer schema from zero rows.\n\
40             Provide at least one object in the array."
41                .to_string(),
42        ));
43    }
44
45    let row_count = arr.len();
46    let mut all_rows: Vec<Map<String, Value>> = Vec::new();
47
48    // Extract objects from array
49    for (idx, item) in arr.into_iter().enumerate() {
50        match item {
51            Value::Object(obj) => all_rows.push(obj),
52            other => {
53                let type_name = match other {
54                    Value::Null => "null",
55                    Value::Bool(_) => "boolean",
56                    Value::Number(_) => "number",
57                    Value::String(_) => "string",
58                    Value::Array(_) => "array",
59                    Value::Object(_) => unreachable!(),
60                };
61                return Err(SchemaError::InvalidInput(format!(
62                    "Array must contain only objects (tabular data). Found {} at index {}.\n\
63                     Schema encoding expects arrays of objects like: [{{\"id\": 1}}, {{\"id\": 2}}]",
64                    type_name, idx
65                )));
66            }
67        }
68    }
69
70    // Flatten all objects and collect field names
71    let mut flattened_rows: Vec<HashMap<String, Value>> = Vec::new();
72    let mut all_field_names = std::collections::BTreeSet::new();
73
74    for obj in &all_rows {
75        let flattened = flatten_object(obj, "");
76        for key in flattened.keys() {
77            all_field_names.insert(key.clone());
78        }
79        flattened_rows.push(flattened);
80    }
81
82    let field_names: Vec<String> = all_field_names.into_iter().collect();
83
84    // Infer types and build fields
85    let mut fields = Vec::new();
86    let mut has_nulls = false;
87
88    for field_name in &field_names {
89        let field_type = infer_field_type(&flattened_rows, field_name, &mut has_nulls)?;
90        fields.push(FieldDef::new(field_name.clone(), field_type));
91    }
92
93    // Build values and null bitmap
94    let mut values = Vec::new();
95    let total_values = row_count * fields.len();
96    let bitmap_bytes = total_values.div_ceil(8);
97    let mut null_bitmap = vec![0u8; bitmap_bytes];
98
99    for (row_idx, row) in flattened_rows.iter().enumerate() {
100        for (field_idx, field) in fields.iter().enumerate() {
101            let value_idx = row_idx * fields.len() + field_idx;
102
103            if let Some(json_value) = row.get(&field.name)
104                && json_value.is_null()
105            {
106                values.push(SchemaValue::Null);
107                set_null_bit(&mut null_bitmap, value_idx);
108                has_nulls = true;
109            } else if let Some(json_value) = row.get(&field.name) {
110                values.push(json_to_schema_value(json_value, &field.field_type)?);
111            } else {
112                // Missing field = null
113                values.push(SchemaValue::Null);
114                set_null_bit(&mut null_bitmap, value_idx);
115                has_nulls = true;
116            }
117        }
118    }
119
120    // Build header
121    let mut header = SchemaHeader::new(row_count, fields);
122    if has_nulls {
123        header.null_bitmap = Some(null_bitmap);
124        header.set_flag(FLAG_HAS_NULLS);
125    }
126
127    IntermediateRepresentation::new(header, values)
128}
129
130/// Parse single object (may have root key)
131fn parse_object(obj: Map<String, Value>) -> Result<IntermediateRepresentation, SchemaError> {
132    // Check if there's a single key containing an array of objects (root key pattern)
133    // e.g. {"users":[{"id":1},{"id":2}]} vs {"scores":[1,2,3]}
134    if obj.len() == 1 {
135        // Check if value is an array of objects before consuming
136        let is_root_key_pattern = obj
137            .values()
138            .next()
139            .map(|v| {
140                if let Value::Array(arr) = v {
141                    // Only treat as root key if array contains objects (tabular data)
142                    !arr.is_empty() && arr.iter().all(|item| matches!(item, Value::Object(_)))
143                } else {
144                    false
145                }
146            })
147            .unwrap_or(false);
148
149        if is_root_key_pattern {
150            // Extract key and value by consuming the map
151            let (key, value) = obj.into_iter().next().unwrap();
152            // We already checked it's an array
153            let arr = match value {
154                Value::Array(a) => a,
155                _ => unreachable!(),
156            };
157
158            // Parse as array with root key
159            let mut ir = parse_array(arr)?;
160            ir.header.root_key = Some(key);
161            ir.header.set_flag(FLAG_HAS_ROOT_KEY);
162            return Ok(ir);
163        }
164    }
165
166    // Single object - treat as single row
167    let flattened = flatten_object(&obj, "");
168    // Preserve field order from original object (serde_json preserves insertion order)
169    let mut field_names = Vec::new();
170    collect_field_names_ordered(&obj, "", &mut field_names);
171
172    let mut fields = Vec::new();
173    let mut has_nulls = false;
174
175    for field_name in &field_names {
176        let value = &flattened[field_name];
177        let field_type = infer_type(value);
178        if value.is_null() {
179            has_nulls = true;
180        }
181        fields.push(FieldDef::new(field_name.clone(), field_type));
182    }
183
184    // Build values and null bitmap
185    let mut values = Vec::new();
186    let total_values = fields.len();
187    let bitmap_bytes = total_values.div_ceil(8);
188    let mut null_bitmap = vec![0u8; bitmap_bytes];
189
190    for (field_idx, field) in fields.iter().enumerate() {
191        let json_value = &flattened[&field.name];
192        if json_value.is_null() {
193            values.push(SchemaValue::Null);
194            set_null_bit(&mut null_bitmap, field_idx);
195        } else {
196            values.push(json_to_schema_value(json_value, &field.field_type)?);
197        }
198    }
199
200    // Build header
201    let mut header = SchemaHeader::new(1, fields);
202    if has_nulls {
203        header.null_bitmap = Some(null_bitmap);
204        header.set_flag(FLAG_HAS_NULLS);
205    }
206
207    IntermediateRepresentation::new(header, values)
208}
209
210/// Collect field names in order from nested object
211fn collect_field_names_ordered(obj: &Map<String, Value>, prefix: &str, names: &mut Vec<String>) {
212    for (key, value) in obj {
213        let full_key = if prefix.is_empty() {
214            key.clone()
215        } else {
216            format!("{}.{}", prefix, key)
217        };
218
219        match value {
220            Value::Object(nested) => {
221                collect_field_names_ordered(nested, &full_key, names);
222            }
223            _ => {
224                names.push(full_key);
225            }
226        }
227    }
228}
229
230/// Flatten nested object into dotted keys
231fn flatten_object(obj: &Map<String, Value>, prefix: &str) -> HashMap<String, Value> {
232    let mut result = HashMap::new();
233
234    for (key, value) in obj {
235        let full_key = if prefix.is_empty() {
236            key.clone()
237        } else {
238            format!("{}.{}", prefix, key)
239        };
240
241        match value {
242            Value::Object(nested) => {
243                result.extend(flatten_object(nested, &full_key));
244            }
245            _ => {
246                result.insert(full_key, value.clone());
247            }
248        }
249    }
250
251    result
252}
253
254/// Infer type from a single JSON value
255fn infer_type(value: &Value) -> FieldType {
256    match value {
257        Value::Null => FieldType::Null,
258        Value::Bool(_) => FieldType::Bool,
259        Value::Number(n) => {
260            if n.is_f64() {
261                // Check if it has a fractional part
262                if let Some(f) = n.as_f64()
263                    && (f.fract() != 0.0 || f.is_infinite() || f.is_nan())
264                {
265                    return FieldType::F64;
266                }
267            }
268
269            if let Some(i) = n.as_i64() {
270                if i < 0 {
271                    FieldType::I64
272                } else {
273                    FieldType::U64
274                }
275            } else if n.as_u64().is_some() {
276                FieldType::U64
277            } else {
278                FieldType::F64
279            }
280        }
281        Value::String(_) => FieldType::String,
282        Value::Array(arr) => {
283            if arr.is_empty() {
284                FieldType::Array(Box::new(FieldType::Null))
285            } else {
286                // Infer from first non-null element
287                let element_type = arr
288                    .iter()
289                    .find(|v| !v.is_null())
290                    .map(infer_type)
291                    .unwrap_or(FieldType::Null);
292                FieldType::Array(Box::new(element_type))
293            }
294        }
295        Value::Object(_) => {
296            // This shouldn't happen after flattening
297            FieldType::String
298        }
299    }
300}
301
302/// Infer field type across multiple rows
303fn infer_field_type(
304    rows: &[HashMap<String, Value>],
305    field_name: &str,
306    has_nulls: &mut bool,
307) -> Result<FieldType, SchemaError> {
308    let mut inferred_type: Option<FieldType> = None;
309
310    for row in rows {
311        if let Some(value) = row.get(field_name) {
312            if value.is_null() {
313                *has_nulls = true;
314                continue;
315            }
316
317            let current_type = infer_type(value);
318
319            if let Some(ref existing_type) = inferred_type {
320                if *existing_type != current_type {
321                    // Type conflict - use Any
322                    return Ok(FieldType::Any);
323                }
324            } else {
325                inferred_type = Some(current_type);
326            }
327        } else {
328            *has_nulls = true;
329        }
330    }
331
332    Ok(inferred_type.unwrap_or(FieldType::Null))
333}
334
335/// Convert JSON value to SchemaValue
336fn json_to_schema_value(
337    value: &Value,
338    expected_type: &FieldType,
339) -> Result<SchemaValue, SchemaError> {
340    match value {
341        Value::Null => Ok(SchemaValue::Null),
342        Value::Bool(b) => Ok(SchemaValue::Bool(*b)),
343        Value::Number(n) => match expected_type {
344            FieldType::U64 | FieldType::Any => {
345                if let Some(u) = n.as_u64() {
346                    Ok(SchemaValue::U64(u))
347                } else if let Some(i) = n.as_i64() {
348                    Ok(SchemaValue::I64(i))
349                } else {
350                    Ok(SchemaValue::F64(n.as_f64().unwrap()))
351                }
352            }
353            FieldType::I64 => {
354                if let Some(i) = n.as_i64() {
355                    Ok(SchemaValue::I64(i))
356                } else {
357                    Ok(SchemaValue::I64(n.as_f64().unwrap() as i64))
358                }
359            }
360            FieldType::F64 => Ok(SchemaValue::F64(n.as_f64().unwrap())),
361            _ => Err(SchemaError::InvalidInput(format!(
362                "Type mismatch: expected {}, but found number.\n\
363                 The field type was inferred or specified as {}, which doesn't accept numeric values.",
364                expected_type.display_name(),
365                expected_type.display_name()
366            ))),
367        },
368        Value::String(s) => Ok(SchemaValue::String(s.clone())),
369        Value::Array(arr) => {
370            let element_type = if let FieldType::Array(et) = expected_type {
371                et.as_ref()
372            } else {
373                return Err(SchemaError::InvalidInput(format!(
374                    "Internal error: Expected array type but found {}. This is a bug in type inference.",
375                    expected_type.display_name()
376                )));
377            };
378
379            let mut schema_values = Vec::new();
380            for item in arr {
381                schema_values.push(json_to_schema_value(item, element_type)?);
382            }
383            Ok(SchemaValue::Array(schema_values))
384        }
385        Value::Object(_) => Err(SchemaError::InvalidInput(
386            "Internal error: Encountered nested object that wasn't flattened. This is a bug in the JSON parser."
387                .to_string(),
388        )),
389    }
390}
391
/// Mark position `index` as null in `bitmap`.
///
/// Bits are packed eight per byte, least-significant bit first: bit `index`
/// lives in byte `index / 8` at bit offset `index % 8`.
///
/// # Panics
///
/// Panics if `index / 8` is outside `bitmap`.
fn set_null_bit(bitmap: &mut [u8], index: usize) {
    let mask = 1u8 << (index % 8);
    bitmap[index / 8] |= mask;
}
398
#[cfg(test)]
mod tests {
    use super::*;

    /// Parse `input`, failing the test if the parser reports an error.
    fn parse_ok(input: &str) -> IntermediateRepresentation {
        JsonParser::parse(input).unwrap()
    }

    /// A flat object becomes one row with one value per key.
    #[test]
    fn test_simple_object() {
        let ir = parse_ok(r#"{"id":1,"name":"alice"}"#);

        assert_eq!(ir.header.row_count, 1);
        assert_eq!(ir.header.fields.len(), 2);
        assert_eq!(ir.values.len(), 2);
    }

    /// An array of objects becomes one row per element.
    #[test]
    fn test_array_of_objects() {
        let ir = parse_ok(r#"[{"id":1,"name":"alice"},{"id":2,"name":"bob"}]"#);

        assert_eq!(ir.header.row_count, 2);
        assert_eq!(ir.header.fields.len(), 2);
        assert_eq!(ir.values.len(), 4);
    }

    /// Nested objects collapse into a single dotted field name.
    #[test]
    fn test_nested_object() {
        let ir = parse_ok(r#"{"user":{"profile":{"name":"alice"}}}"#);

        assert_eq!(ir.header.row_count, 1);
        assert_eq!(ir.header.fields.len(), 1);
        assert_eq!(ir.header.fields[0].name, "user.profile.name");
    }

    /// A single key wrapping an array of objects is recorded as the root key.
    #[test]
    fn test_root_key() {
        let ir = parse_ok(r#"{"users":[{"id":1}]}"#);

        assert_eq!(ir.header.root_key, Some("users".to_string()));
        assert!(ir.header.has_flag(FLAG_HAS_ROOT_KEY));
    }

    /// Every scalar kind is accepted, and a null sets the null flag.
    #[test]
    fn test_all_types() {
        let ir = parse_ok(r#"{"u":1,"i":-1,"f":3.14,"s":"test","b":true,"n":null}"#);

        assert_eq!(ir.header.fields.len(), 6);
        assert!(ir.header.has_flag(FLAG_HAS_NULLS));
    }

    /// A null value is flagged in the header and reported by `is_null`.
    #[test]
    fn test_null_handling() {
        let ir = parse_ok(r#"{"name":"alice","age":null}"#);

        assert!(ir.header.has_flag(FLAG_HAS_NULLS));

        // Look the column up by name rather than assuming its position.
        let fields = &ir.header.fields;
        let age_idx = fields.iter().position(|f| f.name == "age").unwrap();
        assert!(ir.is_null(0, age_idx)); // age field is null
    }

    /// An array of non-negative integers is typed `Array(U64)`.
    #[test]
    fn test_homogeneous_array() {
        let ir = parse_ok(r#"{"scores":[1,2,3]}"#);

        let expected = FieldType::Array(Box::new(FieldType::U64));
        assert_eq!(ir.header.fields[0].field_type, expected);
    }

    /// An empty array is typed `Array(Null)`.
    #[test]
    fn test_empty_array() {
        let ir = parse_ok(r#"{"items":[]}"#);

        let expected = FieldType::Array(Box::new(FieldType::Null));
        assert_eq!(ir.header.fields[0].field_type, expected);
    }

    /// Four levels of nesting still yield one dotted path.
    #[test]
    fn test_deep_nesting() {
        let ir = parse_ok(r#"{"a":{"b":{"c":{"d":1}}}}"#);

        assert_eq!(ir.header.fields[0].name, "a.b.c.d");
    }

    /// `flatten_object` on its own produces the dotted key.
    #[test]
    fn test_flatten_object() {
        let obj: Map<String, Value> = serde_json::from_str(r#"{"a":{"b":1}}"#).unwrap();
        let flattened = flatten_object(&obj, "");

        assert_eq!(flattened.len(), 1);
        assert!(flattened.contains_key("a.b"));
    }
}