Skip to main content

faucet_core/
schema.rs

1//! JSON Schema inference from record samples.
2//!
3//! Given a slice of JSON values (records from a REST API), produces a JSON Schema
4//! that is valid for all of them.  The algorithm:
5//!
6//! * Each field type is inferred independently per record then **merged** across records.
7//! * A field absent from some records gets `"null"` added to its type.
8//! * `"integer"` widens to `"number"` when the same field is an integer in some records
9//!   and a float in others.
10//! * Nested objects are recursively inferred and merged.
11
12use serde_json::{Map, Value, json};
13use std::collections::HashSet;
14
15/// Infer a JSON Schema `object` descriptor from a slice of record values.
16///
17/// Non-object top-level values are ignored.  Returns an empty-properties
18/// object schema when `records` is empty or contains no objects.
19pub fn infer_schema(records: &[Value]) -> Value {
20    let objects: Vec<&Map<String, Value>> = records.iter().filter_map(|r| r.as_object()).collect();
21
22    if objects.is_empty() {
23        return json!({"type": "object", "properties": {}});
24    }
25
26    // Collect all field names across all records.
27    let all_keys: HashSet<&String> = objects.iter().flat_map(|o| o.keys()).collect();
28
29    let mut properties = Map::new();
30
31    for key in all_keys {
32        let values: Vec<&Value> = objects.iter().filter_map(|o| o.get(key)).collect();
33        let records_with_key = values.len();
34
35        let mut field_schema = values
36            .into_iter()
37            .map(infer_value_schema)
38            .reduce(merge_schemas)
39            .unwrap_or_else(|| json!({}));
40
41        // Fields absent from some records are implicitly nullable.
42        if records_with_key < objects.len() {
43            add_null_type(&mut field_schema);
44        }
45
46        properties.insert(key.clone(), field_schema);
47    }
48
49    json!({
50        "type": "object",
51        "properties": Value::Object(properties)
52    })
53}
54
55// ── Internal helpers ──────────────────────────────────────────────────────────
56
57fn infer_value_schema(v: &Value) -> Value {
58    match v {
59        Value::Null => json!({"type": "null"}),
60        Value::Bool(_) => json!({"type": "boolean"}),
61        Value::Number(n) => {
62            if n.is_i64() || n.is_u64() {
63                json!({"type": "integer"})
64            } else {
65                json!({"type": "number"})
66            }
67        }
68        Value::String(_) => json!({"type": "string"}),
69        Value::Array(arr) => {
70            let items = if arr.is_empty() {
71                json!({})
72            } else {
73                arr.iter()
74                    .map(infer_value_schema)
75                    .reduce(merge_schemas)
76                    .unwrap_or_else(|| json!({}))
77            };
78            json!({"type": "array", "items": items})
79        }
80        Value::Object(map) => {
81            let props: Map<String, Value> = map
82                .iter()
83                .map(|(k, v)| (k.clone(), infer_value_schema(v)))
84                .collect();
85            json!({"type": "object", "properties": Value::Object(props)})
86        }
87    }
88}
89
90/// Merge two schemas into one that is valid for both.
91fn merge_schemas(a: Value, b: Value) -> Value {
92    let mut types = collect_types(&a)
93        .union(&collect_types(&b))
94        .cloned()
95        .collect::<Vec<_>>();
96
97    // Numeric widening: integer + number → number.
98    if types.contains(&"integer".to_string()) && types.contains(&"number".to_string()) {
99        types.retain(|t| t != "integer");
100    }
101    types.sort();
102    types.dedup();
103
104    // Build the merged schema, preserving *both* `properties` (when the union
105    // includes `object`) and `items` (when it includes `array`). The previous
106    // implementation returned early on `object` and dropped any array `items`,
107    // so a field that was an array in some records and an object in others
108    // lost its element shape (#78/#35).
109    // Two untyped fragments carry no information — return the unknown schema
110    // `{}` rather than a malformed `{"type": []}` (#78/#35).
111    if types.is_empty() {
112        return json!({});
113    }
114
115    let has_object = types.iter().any(|t| t == "object");
116    let has_array = types.iter().any(|t| t == "array");
117
118    let mut result = Map::new();
119    result.insert("type".to_string(), make_type_value(types));
120
121    if has_object {
122        let props = merge_properties(extract_properties(&a), extract_properties(&b));
123        result.insert("properties".to_string(), Value::Object(props));
124    }
125
126    if has_array {
127        // Merge the element schemas from whichever side(s) carried `items`.
128        // Omit `items` entirely when the element type is genuinely unknown
129        // (e.g. only empty arrays were seen) rather than emitting `items: {}`.
130        let items = match (a.get("items").cloned(), b.get("items").cloned()) {
131            (Some(x), Some(y)) => Some(merge_schemas(x, y)),
132            (Some(x), None) | (None, Some(x)) => Some(x),
133            (None, None) => None,
134        };
135        if let Some(items) = items
136            && !is_unknown_schema(&items)
137        {
138            result.insert("items".to_string(), items);
139        }
140    }
141
142    Value::Object(result)
143}
144
145/// A schema carrying no information — `{}` or `null`. Used to decide whether
146/// an array's `items` is worth emitting.
147fn is_unknown_schema(schema: &Value) -> bool {
148    match schema {
149        Value::Object(m) => m.is_empty(),
150        Value::Null => true,
151        _ => false,
152    }
153}
154
155fn merge_properties(a: Map<String, Value>, b: Map<String, Value>) -> Map<String, Value> {
156    let keys_a: HashSet<String> = a.keys().cloned().collect();
157    let keys_b: HashSet<String> = b.keys().cloned().collect();
158    let mut result = Map::new();
159
160    // Keys in both: merge.
161    for key in keys_a.intersection(&keys_b) {
162        result.insert(key.clone(), merge_schemas(a[key].clone(), b[key].clone()));
163    }
164    // Keys only in A: field can be absent → nullable.
165    for key in keys_a.difference(&keys_b) {
166        let mut s = a[key].clone();
167        add_null_type(&mut s);
168        result.insert(key.clone(), s);
169    }
170    // Keys only in B: field can be absent → nullable.
171    for key in keys_b.difference(&keys_a) {
172        let mut s = b[key].clone();
173        add_null_type(&mut s);
174        result.insert(key.clone(), s);
175    }
176
177    result
178}
179
180fn collect_types(schema: &Value) -> HashSet<String> {
181    match schema.get("type") {
182        Some(Value::String(t)) => std::iter::once(t.clone()).collect(),
183        Some(Value::Array(arr)) => arr
184            .iter()
185            .filter_map(|v| v.as_str().map(String::from))
186            .collect(),
187        _ => HashSet::new(),
188    }
189}
190
191fn extract_properties(schema: &Value) -> Map<String, Value> {
192    schema
193        .get("properties")
194        .and_then(|p| p.as_object())
195        .cloned()
196        .unwrap_or_default()
197}
198
199/// Add `"null"` to the type of `schema` if not already present.
200fn add_null_type(schema: &mut Value) {
201    let mut types = collect_types(schema);
202    if types.contains("null") {
203        return;
204    }
205    types.insert("null".to_string());
206    let new_type = make_type_value(types.into_iter().collect());
207    match schema {
208        // Insert (not just overwrite) so a type-less `{}` fragment is still
209        // marked nullable instead of silently dropping the null.
210        Value::Object(map) => {
211            map.insert("type".to_string(), new_type);
212        }
213        // A non-object fragment (e.g. a bare `null`) becomes a minimal
214        // nullable object schema.
215        _ => *schema = json!({ "type": new_type }),
216    }
217}
218
219fn make_type_value(mut types: Vec<String>) -> Value {
220    types.sort();
221    types.dedup();
222    if types.len() == 1 {
223        Value::String(types.remove(0))
224    } else {
225        Value::Array(types.into_iter().map(Value::String).collect())
226    }
227}
228
229// ── Tests ─────────────────────────────────────────────────────────────────────
230
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// True when `actual` is the two-element type array `[x, y]`, in either
    /// order.
    fn is_type_pair(actual: &Value, x: &str, y: &str) -> bool {
        *actual == json!([x, y]) || *actual == json!([y, x])
    }

    // ── add_null_type ────────────────────────────────────────────────────────

    #[test]
    fn add_null_type_marks_typeless_schema_nullable() {
        // An unknown `{}` fragment has no `type` key at all; the null must be
        // recorded anyway instead of being dropped.
        let mut s = json!({});
        add_null_type(&mut s);
        assert_eq!(s, json!({"type": "null"}));
    }

    #[test]
    fn add_null_type_adds_null_to_existing_type() {
        let mut s = json!({"type": "string"});
        add_null_type(&mut s);
        assert_eq!(s["type"], json!(["null", "string"]));
    }

    #[test]
    fn add_null_type_is_idempotent_when_already_nullable() {
        let mut s = json!({"type": ["null", "string"]});
        add_null_type(&mut s);
        assert_eq!(s["type"], json!(["null", "string"]));
    }

    // ── infer_schema ─────────────────────────────────────────────────────────

    #[test]
    fn test_infer_schema_basic_types() {
        let schema = infer_schema(&[json!({
            "id": 1,
            "name": "Alice",
            "score": 9.5,
            "active": true
        })]);
        let props = &schema["properties"];
        assert_eq!(props["id"]["type"], "integer");
        assert_eq!(props["name"]["type"], "string");
        assert_eq!(props["score"]["type"], "number");
        assert_eq!(props["active"]["type"], "boolean");
    }

    #[test]
    fn test_infer_schema_nullable_absent_field() {
        let schema = infer_schema(&[
            json!({"id": 1, "email": "a@example.com"}),
            json!({"id": 2}),
        ]);
        let props = &schema["properties"];
        assert_eq!(props["id"]["type"], "integer");
        // The second record has no `email`, so the field must be nullable.
        let email_type = &props["email"]["type"];
        assert!(
            is_type_pair(email_type, "null", "string"),
            "expected nullable string, got {email_type}"
        );
    }

    #[test]
    fn test_infer_schema_explicit_null_value() {
        let schema = infer_schema(&[json!({"tag": "foo"}), json!({"tag": null})]);
        let tag_type = &schema["properties"]["tag"]["type"];
        assert!(
            is_type_pair(tag_type, "null", "string"),
            "expected nullable string, got {tag_type}"
        );
    }

    #[test]
    fn test_infer_schema_integer_widens_to_number() {
        let schema = infer_schema(&[json!({"val": 42}), json!({"val": 3.15})]);
        assert_eq!(schema["properties"]["val"]["type"], "number");
    }

    #[test]
    fn test_infer_schema_array_field() {
        let schema = infer_schema(&[json!({"tags": ["rust", "api"]})]);
        let tags = &schema["properties"]["tags"];
        assert_eq!(tags["type"], "array");
        assert_eq!(tags["items"]["type"], "string");
    }

    #[test]
    fn test_infer_schema_nested_object() {
        let schema = infer_schema(&[
            json!({"address": {"city": "NYC", "zip": "10001"}}),
            json!({"address": {"city": "LA"}}),
        ]);
        let addr = &schema["properties"]["address"];
        assert_eq!(addr["type"], "object");
        assert_eq!(addr["properties"]["city"]["type"], "string");
        // Only the first address has a `zip`, so it must be nullable.
        let zip_type = &addr["properties"]["zip"]["type"];
        assert!(
            is_type_pair(zip_type, "null", "string"),
            "expected nullable string, got {zip_type}"
        );
    }

    #[test]
    fn test_infer_schema_empty_records() {
        let schema = infer_schema(&[]);
        assert_eq!(schema["type"], "object");
        assert_eq!(schema["properties"], json!({}));
    }

    #[test]
    fn test_infer_schema_skips_non_objects() {
        // Top-level strings and numbers contribute nothing to the schema.
        let schema = infer_schema(&[json!("string"), json!(42), json!({"id": 1})]);
        assert_eq!(schema["properties"]["id"]["type"], "integer");
    }

    #[test]
    fn test_add_null_type_idempotent() {
        let mut s = json!({"type": ["null", "string"]});
        add_null_type(&mut s);
        // "null" must not appear twice.
        assert_eq!(s["type"], json!(["null", "string"]));
    }

    // ── merge_schemas ────────────────────────────────────────────────────────

    #[test]
    fn test_merge_schemas_object_merges_properties() {
        let merged = merge_schemas(
            json!({"type": "object", "properties": {"x": {"type": "integer"}}}),
            json!({"type": "object", "properties": {"y": {"type": "string"}}}),
        );
        assert_eq!(merged["type"], "object");
        // `x` exists only on the first side → nullable after the merge.
        let x_type = &merged["properties"]["x"]["type"];
        assert!(is_type_pair(x_type, "integer", "null"), "got {x_type}");
        // `y` exists only on the second side → nullable after the merge.
        let y_type = &merged["properties"]["y"]["type"];
        assert!(is_type_pair(y_type, "null", "string"), "got {y_type}");
    }

    #[test]
    fn test_merge_schemas_array_items_merged() {
        let merged = merge_schemas(
            json!({"type": "array", "items": {"type": "integer"}}),
            json!({"type": "array", "items": {"type": "string"}}),
        );
        assert_eq!(merged["type"], "array");
        let items_type = &merged["items"]["type"];
        assert!(
            is_type_pair(items_type, "integer", "string"),
            "got {items_type}"
        );
    }

    #[test]
    fn test_merge_schemas_array_object_union_preserves_items_and_properties() {
        // Regression for #78/#35: when a field is an array in some records
        // and an object in others, the merged schema has to carry the array
        // `items` as well as the object `properties`.
        let merged = merge_schemas(
            json!({"type": "array", "items": {"type": "integer"}}),
            json!({"type": "object", "properties": {"k": {"type": "string"}}}),
        );
        let types = &merged["type"];
        assert!(is_type_pair(types, "array", "object"), "got {types}");
        assert_eq!(merged["items"]["type"], "integer", "array items dropped");
        // `k` comes from the object variant only → nullable in the union.
        let k_type = &merged["properties"]["k"]["type"];
        assert!(is_type_pair(k_type, "null", "string"), "got {k_type}");
    }

    #[test]
    fn test_merge_schemas_unknown_array_items_omitted() {
        // Two empty arrays say nothing about their elements, so `items` is
        // left out instead of being emitted as `items: {}` (#78/#35).
        let merged = merge_schemas(
            json!({"type": "array", "items": {}}),
            json!({"type": "array", "items": {}}),
        );
        assert_eq!(merged["type"], "array");
        assert!(merged.get("items").is_none(), "got {merged}");
    }
}