use serde_json::{Map, Value, json};
use std::collections::HashSet;
pub fn infer_schema(records: &[Value]) -> Value {
let objects: Vec<&Map<String, Value>> = records.iter().filter_map(|r| r.as_object()).collect();
if objects.is_empty() {
return json!({"type": "object", "properties": {}});
}
let all_keys: HashSet<&String> = objects.iter().flat_map(|o| o.keys()).collect();
let mut properties = Map::new();
for key in all_keys {
let values: Vec<&Value> = objects.iter().filter_map(|o| o.get(key)).collect();
let records_with_key = values.len();
let mut field_schema = values
.into_iter()
.map(infer_value_schema)
.reduce(merge_schemas)
.unwrap_or_else(|| json!({}));
if records_with_key < objects.len() {
add_null_type(&mut field_schema);
}
properties.insert(key.clone(), field_schema);
}
json!({
"type": "object",
"properties": Value::Object(properties)
})
}
fn infer_value_schema(v: &Value) -> Value {
match v {
Value::Null => json!({"type": "null"}),
Value::Bool(_) => json!({"type": "boolean"}),
Value::Number(n) => {
if n.is_i64() || n.is_u64() {
json!({"type": "integer"})
} else {
json!({"type": "number"})
}
}
Value::String(_) => json!({"type": "string"}),
Value::Array(arr) => {
let items = if arr.is_empty() {
json!({})
} else {
arr.iter()
.map(infer_value_schema)
.reduce(merge_schemas)
.unwrap_or_else(|| json!({}))
};
json!({"type": "array", "items": items})
}
Value::Object(map) => {
let props: Map<String, Value> = map
.iter()
.map(|(k, v)| (k.clone(), infer_value_schema(v)))
.collect();
json!({"type": "object", "properties": Value::Object(props)})
}
}
}
fn merge_schemas(a: Value, b: Value) -> Value {
let mut types = collect_types(&a)
.union(&collect_types(&b))
.cloned()
.collect::<Vec<_>>();
if types.contains(&"integer".to_string()) && types.contains(&"number".to_string()) {
types.retain(|t| t != "integer");
}
types.sort();
types.dedup();
if types.contains(&"object".to_string()) {
let props = merge_properties(extract_properties(&a), extract_properties(&b));
return json!({
"type": make_type_value(types),
"properties": Value::Object(props)
});
}
if types == ["array"] {
let items_a = a.get("items").cloned().unwrap_or_else(|| json!({}));
let items_b = b.get("items").cloned().unwrap_or_else(|| json!({}));
return json!({
"type": "array",
"items": merge_schemas(items_a, items_b)
});
}
json!({"type": make_type_value(types)})
}
fn merge_properties(a: Map<String, Value>, b: Map<String, Value>) -> Map<String, Value> {
let keys_a: HashSet<String> = a.keys().cloned().collect();
let keys_b: HashSet<String> = b.keys().cloned().collect();
let mut result = Map::new();
for key in keys_a.intersection(&keys_b) {
result.insert(key.clone(), merge_schemas(a[key].clone(), b[key].clone()));
}
for key in keys_a.difference(&keys_b) {
let mut s = a[key].clone();
add_null_type(&mut s);
result.insert(key.clone(), s);
}
for key in keys_b.difference(&keys_a) {
let mut s = b[key].clone();
add_null_type(&mut s);
result.insert(key.clone(), s);
}
result
}
fn collect_types(schema: &Value) -> HashSet<String> {
match schema.get("type") {
Some(Value::String(t)) => std::iter::once(t.clone()).collect(),
Some(Value::Array(arr)) => arr
.iter()
.filter_map(|v| v.as_str().map(String::from))
.collect(),
_ => HashSet::new(),
}
}
fn extract_properties(schema: &Value) -> Map<String, Value> {
schema
.get("properties")
.and_then(|p| p.as_object())
.cloned()
.unwrap_or_default()
}
fn add_null_type(schema: &mut Value) {
let mut types = collect_types(schema);
if types.contains("null") {
return;
}
types.insert("null".to_string());
let new_type = make_type_value(types.into_iter().collect());
if let Some(t) = schema.get_mut("type") {
*t = new_type;
}
}
fn make_type_value(mut types: Vec<String>) -> Value {
types.sort();
types.dedup();
if types.len() == 1 {
Value::String(types.remove(0))
} else {
Value::Array(types.into_iter().map(Value::String).collect())
}
}
#[cfg(test)]
mod tests {
use super::*;
use serde_json::json;
#[test]
fn test_infer_schema_basic_types() {
let records = vec![json!({"id": 1, "name": "Alice", "score": 9.5, "active": true})];
let schema = infer_schema(&records);
let props = &schema["properties"];
assert_eq!(props["id"]["type"], "integer");
assert_eq!(props["name"]["type"], "string");
assert_eq!(props["score"]["type"], "number");
assert_eq!(props["active"]["type"], "boolean");
}
#[test]
fn test_infer_schema_nullable_absent_field() {
let records = vec![json!({"id": 1, "email": "a@example.com"}), json!({"id": 2})];
let schema = infer_schema(&records);
let props = &schema["properties"];
assert_eq!(props["id"]["type"], "integer");
let email_type = &props["email"]["type"];
assert!(
email_type == &json!(["null", "string"]) || email_type == &json!(["string", "null"]),
"expected nullable string, got {email_type}"
);
}
#[test]
fn test_infer_schema_explicit_null_value() {
let records = vec![json!({"tag": "foo"}), json!({"tag": null})];
let schema = infer_schema(&records);
let tag_type = &schema["properties"]["tag"]["type"];
assert!(
tag_type == &json!(["null", "string"]) || tag_type == &json!(["string", "null"]),
"expected nullable string, got {tag_type}"
);
}
#[test]
fn test_infer_schema_integer_widens_to_number() {
let records = vec![json!({"val": 42}), json!({"val": 3.15})];
let schema = infer_schema(&records);
assert_eq!(schema["properties"]["val"]["type"], "number");
}
#[test]
fn test_infer_schema_array_field() {
let records = vec![json!({"tags": ["rust", "api"]})];
let schema = infer_schema(&records);
assert_eq!(schema["properties"]["tags"]["type"], "array");
assert_eq!(schema["properties"]["tags"]["items"]["type"], "string");
}
#[test]
fn test_infer_schema_nested_object() {
let records = vec![
json!({"address": {"city": "NYC", "zip": "10001"}}),
json!({"address": {"city": "LA"}}),
];
let schema = infer_schema(&records);
let addr = &schema["properties"]["address"];
assert_eq!(addr["type"], "object");
assert_eq!(addr["properties"]["city"]["type"], "string");
let zip_type = &addr["properties"]["zip"]["type"];
assert!(
zip_type == &json!(["null", "string"]) || zip_type == &json!(["string", "null"]),
"expected nullable string, got {zip_type}"
);
}
#[test]
fn test_infer_schema_empty_records() {
let schema = infer_schema(&[]);
assert_eq!(schema["type"], "object");
assert_eq!(schema["properties"], json!({}));
}
#[test]
fn test_infer_schema_skips_non_objects() {
let records = vec![json!("string"), json!(42), json!({"id": 1})];
let schema = infer_schema(&records);
assert_eq!(schema["properties"]["id"]["type"], "integer");
}
#[test]
fn test_add_null_type_idempotent() {
let mut s = json!({"type": ["null", "string"]});
add_null_type(&mut s);
assert_eq!(s["type"], json!(["null", "string"]));
}
#[test]
fn test_merge_schemas_object_merges_properties() {
let a = json!({"type": "object", "properties": {"x": {"type": "integer"}}});
let b = json!({"type": "object", "properties": {"y": {"type": "string"}}});
let merged = merge_schemas(a, b);
assert_eq!(merged["type"], "object");
let x_type = &merged["properties"]["x"]["type"];
assert!(
x_type == &json!(["integer", "null"]) || x_type == &json!(["null", "integer"]),
"got {x_type}"
);
let y_type = &merged["properties"]["y"]["type"];
assert!(
y_type == &json!(["null", "string"]) || y_type == &json!(["string", "null"]),
"got {y_type}"
);
}
#[test]
fn test_merge_schemas_array_items_merged() {
let a = json!({"type": "array", "items": {"type": "integer"}});
let b = json!({"type": "array", "items": {"type": "string"}});
let merged = merge_schemas(a, b);
assert_eq!(merged["type"], "array");
let items_type = &merged["items"]["type"];
assert!(
items_type == &json!(["integer", "string"])
|| items_type == &json!(["string", "integer"]),
"got {items_type}"
);
}
}