1use serde_json::{Map, Value, json};
13use std::collections::HashSet;
14
15pub fn infer_schema(records: &[Value]) -> Value {
20 let objects: Vec<&Map<String, Value>> = records.iter().filter_map(|r| r.as_object()).collect();
21
22 if objects.is_empty() {
23 return json!({"type": "object", "properties": {}});
24 }
25
26 let all_keys: HashSet<&String> = objects.iter().flat_map(|o| o.keys()).collect();
28
29 let mut properties = Map::new();
30
31 for key in all_keys {
32 let values: Vec<&Value> = objects.iter().filter_map(|o| o.get(key)).collect();
33 let records_with_key = values.len();
34
35 let mut field_schema = values
36 .into_iter()
37 .map(infer_value_schema)
38 .reduce(merge_schemas)
39 .unwrap_or_else(|| json!({}));
40
41 if records_with_key < objects.len() {
43 add_null_type(&mut field_schema);
44 }
45
46 properties.insert(key.clone(), field_schema);
47 }
48
49 json!({
50 "type": "object",
51 "properties": Value::Object(properties)
52 })
53}
54
55fn infer_value_schema(v: &Value) -> Value {
58 match v {
59 Value::Null => json!({"type": "null"}),
60 Value::Bool(_) => json!({"type": "boolean"}),
61 Value::Number(n) => {
62 if n.is_i64() || n.is_u64() {
63 json!({"type": "integer"})
64 } else {
65 json!({"type": "number"})
66 }
67 }
68 Value::String(_) => json!({"type": "string"}),
69 Value::Array(arr) => {
70 let items = if arr.is_empty() {
71 json!({})
72 } else {
73 arr.iter()
74 .map(infer_value_schema)
75 .reduce(merge_schemas)
76 .unwrap_or_else(|| json!({}))
77 };
78 json!({"type": "array", "items": items})
79 }
80 Value::Object(map) => {
81 let props: Map<String, Value> = map
82 .iter()
83 .map(|(k, v)| (k.clone(), infer_value_schema(v)))
84 .collect();
85 json!({"type": "object", "properties": Value::Object(props)})
86 }
87 }
88}
89
90fn merge_schemas(a: Value, b: Value) -> Value {
92 let mut types = collect_types(&a)
93 .union(&collect_types(&b))
94 .cloned()
95 .collect::<Vec<_>>();
96
97 if types.contains(&"integer".to_string()) && types.contains(&"number".to_string()) {
99 types.retain(|t| t != "integer");
100 }
101 types.sort();
102 types.dedup();
103
104 if types.is_empty() {
112 return json!({});
113 }
114
115 let has_object = types.iter().any(|t| t == "object");
116 let has_array = types.iter().any(|t| t == "array");
117
118 let mut result = Map::new();
119 result.insert("type".to_string(), make_type_value(types));
120
121 if has_object {
122 let props = merge_properties(extract_properties(&a), extract_properties(&b));
123 result.insert("properties".to_string(), Value::Object(props));
124 }
125
126 if has_array {
127 let items = match (a.get("items").cloned(), b.get("items").cloned()) {
131 (Some(x), Some(y)) => Some(merge_schemas(x, y)),
132 (Some(x), None) | (None, Some(x)) => Some(x),
133 (None, None) => None,
134 };
135 if let Some(items) = items
136 && !is_unknown_schema(&items)
137 {
138 result.insert("items".to_string(), items);
139 }
140 }
141
142 Value::Object(result)
143}
144
145fn is_unknown_schema(schema: &Value) -> bool {
148 match schema {
149 Value::Object(m) => m.is_empty(),
150 Value::Null => true,
151 _ => false,
152 }
153}
154
155fn merge_properties(a: Map<String, Value>, b: Map<String, Value>) -> Map<String, Value> {
156 let keys_a: HashSet<String> = a.keys().cloned().collect();
157 let keys_b: HashSet<String> = b.keys().cloned().collect();
158 let mut result = Map::new();
159
160 for key in keys_a.intersection(&keys_b) {
162 result.insert(key.clone(), merge_schemas(a[key].clone(), b[key].clone()));
163 }
164 for key in keys_a.difference(&keys_b) {
166 let mut s = a[key].clone();
167 add_null_type(&mut s);
168 result.insert(key.clone(), s);
169 }
170 for key in keys_b.difference(&keys_a) {
172 let mut s = b[key].clone();
173 add_null_type(&mut s);
174 result.insert(key.clone(), s);
175 }
176
177 result
178}
179
180fn collect_types(schema: &Value) -> HashSet<String> {
181 match schema.get("type") {
182 Some(Value::String(t)) => std::iter::once(t.clone()).collect(),
183 Some(Value::Array(arr)) => arr
184 .iter()
185 .filter_map(|v| v.as_str().map(String::from))
186 .collect(),
187 _ => HashSet::new(),
188 }
189}
190
191fn extract_properties(schema: &Value) -> Map<String, Value> {
192 schema
193 .get("properties")
194 .and_then(|p| p.as_object())
195 .cloned()
196 .unwrap_or_default()
197}
198
199fn add_null_type(schema: &mut Value) {
201 let mut types = collect_types(schema);
202 if types.contains("null") {
203 return;
204 }
205 types.insert("null".to_string());
206 let new_type = make_type_value(types.into_iter().collect());
207 match schema {
208 Value::Object(map) => {
211 map.insert("type".to_string(), new_type);
212 }
213 _ => *schema = json!({ "type": new_type }),
216 }
217}
218
219fn make_type_value(mut types: Vec<String>) -> Value {
220 types.sort();
221 types.dedup();
222 if types.len() == 1 {
223 Value::String(types.remove(0))
224 } else {
225 Value::Array(types.into_iter().map(Value::String).collect())
226 }
227}
228
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Asserts that `actual` is a JSON array containing exactly the `expected`
    /// type names, in any order.
    fn assert_type_union(actual: &Value, expected: &[&str]) {
        // Non-string entries become `None` and a non-array becomes an empty
        // list, so both cases fail the comparison below.
        let mut got: Vec<Option<&str>> = actual
            .as_array()
            .map(|names| names.iter().map(Value::as_str).collect())
            .unwrap_or_default();
        got.sort_unstable();

        let mut want: Vec<Option<&str>> = expected.iter().copied().map(Some).collect();
        want.sort_unstable();

        assert_eq!(got, want, "unexpected type union: {actual}");
    }

    /// A schema without a `type` ends up with `"type": "null"`.
    #[test]
    fn add_null_type_marks_typeless_schema_nullable() {
        let mut s = json!({});
        add_null_type(&mut s);
        assert_eq!(s, json!({"type": "null"}));
    }

    /// A single type is widened into a sorted two-element type array.
    #[test]
    fn add_null_type_adds_null_to_existing_type() {
        let mut s = json!({"type": "string"});
        add_null_type(&mut s);
        assert_eq!(s["type"], json!(["null", "string"]));
    }

    /// An already nullable schema is left unchanged.
    #[test]
    fn add_null_type_is_idempotent_when_already_nullable() {
        let mut s = json!({"type": ["null", "string"]});
        add_null_type(&mut s);
        assert_eq!(s["type"], json!(["null", "string"]));
    }

    /// Each scalar JSON kind maps to its schema type name.
    #[test]
    fn test_infer_schema_basic_types() {
        let records = vec![json!({"id": 1, "name": "Alice", "score": 9.5, "active": true})];
        let schema = infer_schema(&records);
        let props = &schema["properties"];
        assert_eq!(props["id"]["type"], "integer");
        assert_eq!(props["name"]["type"], "string");
        assert_eq!(props["score"]["type"], "number");
        assert_eq!(props["active"]["type"], "boolean");
    }

    /// A field missing from some records becomes nullable.
    #[test]
    fn test_infer_schema_nullable_absent_field() {
        let records = vec![json!({"id": 1, "email": "a@example.com"}), json!({"id": 2})];
        let schema = infer_schema(&records);
        let props = &schema["properties"];
        assert_eq!(props["id"]["type"], "integer");
        assert_type_union(&props["email"]["type"], &["null", "string"]);
    }

    /// An explicit `null` value also makes the field nullable.
    #[test]
    fn test_infer_schema_explicit_null_value() {
        let records = vec![json!({"tag": "foo"}), json!({"tag": null})];
        let schema = infer_schema(&records);
        assert_type_union(&schema["properties"]["tag"]["type"], &["null", "string"]);
    }

    /// Mixing integers and floats yields the wider `"number"` type only.
    #[test]
    fn test_infer_schema_integer_widens_to_number() {
        let records = vec![json!({"val": 42}), json!({"val": 3.15})];
        let schema = infer_schema(&records);
        assert_eq!(schema["properties"]["val"]["type"], "number");
    }

    /// Array fields report the merged schema of their elements as `items`.
    #[test]
    fn test_infer_schema_array_field() {
        let records = vec![json!({"tags": ["rust", "api"]})];
        let schema = infer_schema(&records);
        assert_eq!(schema["properties"]["tags"]["type"], "array");
        assert_eq!(schema["properties"]["tags"]["items"]["type"], "string");
    }

    /// Nested object properties are merged, with missing ones made nullable.
    #[test]
    fn test_infer_schema_nested_object() {
        let records = vec![
            json!({"address": {"city": "NYC", "zip": "10001"}}),
            json!({"address": {"city": "LA"}}),
        ];
        let schema = infer_schema(&records);
        let addr = &schema["properties"]["address"];
        assert_eq!(addr["type"], "object");
        assert_eq!(addr["properties"]["city"]["type"], "string");
        assert_type_union(&addr["properties"]["zip"]["type"], &["null", "string"]);
    }

    /// No records at all produce an empty object schema.
    #[test]
    fn test_infer_schema_empty_records() {
        let schema = infer_schema(&[]);
        assert_eq!(schema["type"], "object");
        assert_eq!(schema["properties"], json!({}));
    }

    /// Non-object records are skipped and do not make fields nullable.
    #[test]
    fn test_infer_schema_skips_non_objects() {
        let records = vec![json!("string"), json!(42), json!({"id": 1})];
        let schema = infer_schema(&records);
        assert_eq!(schema["properties"]["id"]["type"], "integer");
    }

    /// Properties found on only one side of an object merge become nullable.
    #[test]
    fn test_merge_schemas_object_merges_properties() {
        let a = json!({"type": "object", "properties": {"x": {"type": "integer"}}});
        let b = json!({"type": "object", "properties": {"y": {"type": "string"}}});
        let merged = merge_schemas(a, b);
        assert_eq!(merged["type"], "object");
        assert_type_union(&merged["properties"]["x"]["type"], &["integer", "null"]);
        assert_type_union(&merged["properties"]["y"]["type"], &["null", "string"]);
    }

    /// `items` schemas of two arrays are merged recursively.
    #[test]
    fn test_merge_schemas_array_items_merged() {
        let a = json!({"type": "array", "items": {"type": "integer"}});
        let b = json!({"type": "array", "items": {"type": "string"}});
        let merged = merge_schemas(a, b);
        assert_eq!(merged["type"], "array");
        assert_type_union(&merged["items"]["type"], &["integer", "string"]);
    }

    /// An array/object union keeps both `items` and `properties`.
    #[test]
    fn test_merge_schemas_array_object_union_preserves_items_and_properties() {
        let arr = json!({"type": "array", "items": {"type": "integer"}});
        let obj = json!({"type": "object", "properties": {"k": {"type": "string"}}});
        let merged = merge_schemas(arr, obj);
        assert_type_union(&merged["type"], &["array", "object"]);
        assert_eq!(merged["items"]["type"], "integer", "array items dropped");
        assert_type_union(&merged["properties"]["k"]["type"], &["null", "string"]);
    }

    /// Unknown (`{}`) item schemas are left out of the merged result.
    #[test]
    fn test_merge_schemas_unknown_array_items_omitted() {
        let a = json!({"type": "array", "items": {}});
        let b = json!({"type": "array", "items": {}});
        let merged = merge_schemas(a, b);
        assert_eq!(merged["type"], "array");
        assert!(merged.get("items").is_none(), "got {merged}");
    }
}