use std::collections::HashMap;
use serde_json::Value;
use super::ast::{Grammar, NonTerminalId, Rule, Symbol};
/// Errors reported while compiling a JSON Schema into a grammar.
///
/// `PartialEq`/`Eq` are derived so callers and tests can compare errors
/// directly instead of matching on the rendered message.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum JsonSchemaCompileError {
    /// The schema is structurally invalid; the payload describes the problem.
    InvalidSchema(String),
    /// The schema uses a keyword or construct this compiler does not handle.
    UnsupportedKeyword(String),
    /// A `$ref` names a definition that was never declared; the payload is the
    /// reference string as written.
    DanglingRef(String),
    /// Schema nesting went deeper than the compiler permits.
    DepthExceeded {
        /// The depth limit that was exceeded.
        limit: usize,
    },
    /// The schema text is not valid JSON; the payload is the parser's message.
    InvalidJson(String),
}

impl std::fmt::Display for JsonSchemaCompileError {
    /// Renders a one-line, human-readable description of the error.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::InvalidSchema(msg) => write!(f, "invalid JSON schema: {msg}"),
            Self::UnsupportedKeyword(kw) => write!(f, "unsupported JSON schema keyword: {kw}"),
            Self::DanglingRef(r) => write!(f, "unresolvable $ref: {r}"),
            Self::DepthExceeded { limit } => {
                write!(f, "schema nesting depth exceeded limit of {limit}")
            }
            Self::InvalidJson(msg) => write!(f, "invalid JSON input: {msg}"),
        }
    }
}

impl std::error::Error for JsonSchemaCompileError {}
/// Compiles an already-parsed JSON Schema document into a grammar.
///
/// # Errors
///
/// Propagates any `JsonSchemaCompileError` raised while compiling `schema`.
pub fn compile_json_schema(schema: &Value) -> Result<Grammar, JsonSchemaCompileError> {
    let compiler = Compiler::new();
    compiler.compile(schema)
}
/// Parses `schema_json` as JSON and compiles the resulting schema.
///
/// # Errors
///
/// Returns `JsonSchemaCompileError::InvalidJson` when the text is not valid
/// JSON; otherwise propagates whatever `compile_json_schema` reports.
pub fn compile_json_schema_str(schema_json: &str) -> Result<Grammar, JsonSchemaCompileError> {
    match serde_json::from_str::<Value>(schema_json) {
        Ok(parsed) => compile_json_schema(&parsed),
        Err(parse_err) => Err(JsonSchemaCompileError::InvalidJson(parse_err.to_string())),
    }
}
/// Deepest nesting level `compile_schema` accepts; a call whose `depth` is
/// greater than this fails with `JsonSchemaCompileError::DepthExceeded`.
const MAX_DEPTH: usize = 32;
/// Working state for a single schema-to-grammar compilation.
struct Compiler {
/// Grammar under construction; handed back by `compile` on success.
grammar: Grammar,
/// Non-terminal pre-allocated for each `$defs` / `definitions` key.
defs_nt: HashMap<String, NonTerminalId>,
/// Per-definition flag, set to `true` once compilation of that body has begun.
defs_compiled: HashMap<String, bool>,
/// Cached non-terminal of the shared string grammar, once it has been built.
string_nt: Option<NonTerminalId>,
/// Cached non-terminal matching a single decimal digit, once it has been built.
digit_nt: Option<NonTerminalId>,
}
impl Compiler {
/// Creates a compiler around `Grammar::new(0)` with no definitions registered
/// and no cached non-terminals.
fn new() -> Self {
    Self {
        grammar: Grammar::new(0),
        defs_nt: HashMap::default(),
        defs_compiled: HashMap::default(),
        string_nt: None,
        digit_nt: None,
    }
}
/// Compiles `root` into a grammar, consuming the compiler.
///
/// Non-terminals for every definition are allocated first, then each
/// definition body is compiled, and finally the root schema itself; the
/// non-terminal produced for the root becomes the grammar's start symbol.
///
/// # Errors
///
/// Propagates the first `JsonSchemaCompileError` raised by any definition
/// body or by the root schema.
fn compile(mut self, root: &Value) -> Result<Grammar, JsonSchemaCompileError> {
    self.pass1_alloc_defs(root);

    // `HashMap` iteration order is unspecified and differs between runs.
    // Sorting the keys makes the order in which definition bodies allocate
    // non-terminals and add rules reproducible for a given schema.
    let mut def_keys: Vec<String> = self.defs_nt.keys().cloned().collect();
    def_keys.sort_unstable();

    for key in &def_keys {
        self.pass2_compile_def(key, root)?;
    }

    let start_nt = self.compile_schema(root, 0)?;
    self.grammar.start = start_nt;
    Ok(self.grammar)
}
/// First pass: reserves a non-terminal for every definition so that `$ref`s
/// can be resolved before the referenced body has been compiled.
///
/// Reads `$defs` when that key is present, otherwise `definitions`; does
/// nothing when the chosen value is missing or not a JSON object.
fn pass1_alloc_defs(&mut self, root: &Value) {
    let definitions = match root.get("$defs") {
        Some(found) => found.as_object(),
        None => root.get("definitions").and_then(|v| v.as_object()),
    };
    let map = match definitions {
        Some(map) => map,
        None => return,
    };

    for name in map.keys() {
        if self.defs_nt.contains_key(name) {
            continue;
        }
        let reserved = self.grammar.alloc_nt(format!("$def_{name}"));
        self.defs_nt.insert(name.clone(), reserved);
        self.defs_compiled.insert(name.clone(), false);
    }
}
/// Second pass: compiles the body of definition `key` and links it to the
/// non-terminal reserved for it in the first pass.
///
/// A definition that is already flagged as compiled is skipped. The flag is
/// set before the body is compiled.
///
/// # Errors
///
/// Returns `InvalidSchema` when `key` cannot be found under `$defs` /
/// `definitions`, and propagates errors from compiling the body.
///
/// # Panics
///
/// Panics if no non-terminal was reserved for `key`.
fn pass2_compile_def(&mut self, key: &str, root: &Value) -> Result<(), JsonSchemaCompileError> {
    let already_done = self.defs_compiled.get(key).copied().unwrap_or(false);
    if already_done {
        return Ok(());
    }
    self.defs_compiled.insert(key.to_string(), true);

    let def_nt = self
        .defs_nt
        .get(key)
        .copied()
        .expect("NT pre-allocated in pass 1");

    // Same precedence as the first pass: `$defs` wins when present.
    let container = match root.get("$defs") {
        Some(defs) => Some(defs),
        None => root.get("definitions"),
    };
    let body_value = match container.and_then(|defs| defs.get(key)) {
        Some(body) => body,
        None => {
            return Err(JsonSchemaCompileError::InvalidSchema(format!(
                "$defs key '{key}' not found"
            )));
        }
    };

    let compiled_nt = self.compile_schema(body_value, 0)?;
    if compiled_nt == def_nt {
        // The body resolved to the definition's own non-terminal; adding an
        // alias rule would make it refer to itself.
        return Ok(());
    }
    let alias = Rule::new(def_nt, vec![Symbol::NonTerminal(compiled_nt)]);
    self.grammar.add_rule(alias);
    Ok(())
}
fn compile_schema(
&mut self,
schema: &Value,
depth: usize,
) -> Result<NonTerminalId, JsonSchemaCompileError> {
if depth > MAX_DEPTH {
return Err(JsonSchemaCompileError::DepthExceeded { limit: MAX_DEPTH });
}
if schema.is_boolean() {
let nt = self.grammar.alloc_nt("__bool_schema");
self.grammar
.add_rule(Rule::new(nt, vec![Symbol::Terminal(vec![])]));
return Ok(nt);
}
let obj = match schema.as_object() {
Some(o) => o,
None => {
return Err(JsonSchemaCompileError::InvalidSchema(
"schema must be a JSON object or boolean".to_string(),
));
}
};
for unsupported in &[
"not",
"if",
"then",
"else",
"patternProperties",
"pattern",
"format",
"multipleOf",
"exclusiveMinimum",
"exclusiveMaximum",
] {
if obj.contains_key(*unsupported) {
return Err(JsonSchemaCompileError::UnsupportedKeyword(
unsupported.to_string(),
));
}
}
if let Some(ref_val) = obj.get("$ref") {
return self.compile_ref(ref_val);
}
if let Some(enum_val) = obj.get("enum") {
return self.compile_enum(enum_val, depth);
}
if let Some(any_of) = obj.get("anyOf") {
return self.compile_any_of(any_of, depth);
}
if let Some(one_of) = obj.get("oneOf") {
return self.compile_any_of(one_of, depth);
}
if let Some(all_of) = obj.get("allOf") {
return self.compile_all_of(all_of, schema, depth);
}
match obj.get("type").and_then(|v| v.as_str()) {
Some("string") => self.compile_string_type(),
Some("integer") => self.compile_integer_type(),
Some("number") => self.compile_number_type(),
Some("boolean") => Ok(self.compile_boolean_type()),
Some("null") => Ok(self.compile_null_type()),
Some("object") => self.compile_object_type(schema, depth),
Some("array") => self.compile_array_type(schema, depth),
Some(other) => Err(JsonSchemaCompileError::InvalidSchema(format!(
"unknown type: '{other}'"
))),
None => {
Ok(self.compile_any_value_type())
}
}
}
/// Resolves a local `$ref` to the non-terminal reserved for its definition.
///
/// Only references of the form `#/$defs/<key>` or `#/definitions/<key>` are
/// understood; `<key>` is looked up verbatim.
///
/// # Errors
///
/// * `InvalidSchema` when the `$ref` value is not a string.
/// * `UnsupportedKeyword` when the reference uses any other form.
/// * `DanglingRef` when no definition with that key was registered.
fn compile_ref(&mut self, ref_val: &Value) -> Result<NonTerminalId, JsonSchemaCompileError> {
    let ref_str = match ref_val.as_str() {
        Some(text) => text,
        None => {
            return Err(JsonSchemaCompileError::InvalidSchema(
                "$ref must be a string".to_string(),
            ));
        }
    };

    let local_key = ref_str
        .strip_prefix("#/$defs/")
        .or_else(|| ref_str.strip_prefix("#/definitions/"));
    let key = match local_key {
        Some(k) => k,
        None => {
            return Err(JsonSchemaCompileError::UnsupportedKeyword(format!(
                "$ref to external schema or unsupported path: {ref_str}"
            )));
        }
    };

    match self.defs_nt.get(key) {
        Some(nt) => Ok(*nt),
        None => Err(JsonSchemaCompileError::DanglingRef(ref_str.to_string())),
    }
}
/// Compiles an `enum` keyword into a non-terminal with one literal
/// alternative per listed value.
///
/// `_depth` is accepted for signature symmetry with the other compilers and
/// is not used.
///
/// # Errors
///
/// Returns `InvalidSchema` when the value is not an array or is empty, and
/// propagates errors from `json_value_to_literal` for unsupported members.
fn compile_enum(
    &mut self,
    enum_val: &Value,
    _depth: usize,
) -> Result<NonTerminalId, JsonSchemaCompileError> {
    let members = match enum_val.as_array() {
        None => {
            return Err(JsonSchemaCompileError::InvalidSchema(
                "\"enum\" value must be a JSON array".to_string(),
            ));
        }
        Some(list) if list.is_empty() => {
            return Err(JsonSchemaCompileError::InvalidSchema(
                "\"enum\" array must not be empty".to_string(),
            ));
        }
        Some(list) => list,
    };

    let enum_nt = self.grammar.alloc_nt("__enum");
    for member in members.iter() {
        let bytes = json_value_to_literal(member)?;
        let alternative = Rule::new(enum_nt, vec![Symbol::Terminal(bytes)]);
        self.grammar.add_rule(alternative);
    }
    Ok(enum_nt)
}
/// Compiles an `anyOf` / `oneOf` array into a non-terminal with one
/// alternative per variant. Each variant is compiled one level deeper.
///
/// # Errors
///
/// Returns `InvalidSchema` when `arr` is not an array or is empty, and
/// propagates errors from compiling any variant.
fn compile_any_of(
    &mut self,
    arr: &Value,
    depth: usize,
) -> Result<NonTerminalId, JsonSchemaCompileError> {
    let variants = match arr.as_array() {
        None => {
            return Err(JsonSchemaCompileError::InvalidSchema(
                "anyOf/oneOf must be an array".to_string(),
            ));
        }
        Some(list) if list.is_empty() => {
            return Err(JsonSchemaCompileError::InvalidSchema(
                "anyOf/oneOf must have at least one variant".to_string(),
            ));
        }
        Some(list) => list,
    };

    let any_nt = self.grammar.alloc_nt("__anyOf");
    let child_depth = depth + 1;
    for variant in variants.iter() {
        let variant_nt = self.compile_schema(variant, child_depth)?;
        let alternative = Rule::new(any_nt, vec![Symbol::NonTerminal(variant_nt)]);
        self.grammar.add_rule(alternative);
    }
    Ok(any_nt)
}
/// Compiles an `allOf` array by merging its object variants into a single
/// synthetic object schema and compiling that.
///
/// Every variant must either declare `"type": "object"` or carry a
/// `properties` key. Properties are merged by name, with a later variant
/// overwriting an earlier one. `required` lists are concatenated without
/// duplicates. `additionalProperties: false` on any variant is carried over
/// to the merged schema. `_parent` is not used.
///
/// # Errors
///
/// Returns `InvalidSchema` when `all_of_arr` is not an array or is empty,
/// `UnsupportedKeyword` when a variant is not an object schema, and
/// propagates errors from compiling the merged object.
fn compile_all_of(
    &mut self,
    all_of_arr: &Value,
    _parent: &Value,
    depth: usize,
) -> Result<NonTerminalId, JsonSchemaCompileError> {
    let variants = match all_of_arr.as_array() {
        None => {
            return Err(JsonSchemaCompileError::InvalidSchema(
                "allOf must be an array".to_string(),
            ));
        }
        Some(list) if list.is_empty() => {
            return Err(JsonSchemaCompileError::InvalidSchema(
                "allOf must have at least one element".to_string(),
            ));
        }
        Some(list) => list,
    };

    let looks_like_object = |variant: &Value| {
        variant.get("type").and_then(|v| v.as_str()) == Some("object")
            || variant.get("properties").is_some()
    };
    if !variants.iter().all(looks_like_object) {
        return Err(JsonSchemaCompileError::UnsupportedKeyword(
            "allOf can only merge object schemas in this compiler".to_string(),
        ));
    }

    let mut merged_props: serde_json::Map<String, Value> = serde_json::Map::new();
    let mut merged_required: Vec<String> = Vec::new();
    let mut forbids_additional = false;

    for variant in variants.iter() {
        if let Some(props) = variant.get("properties").and_then(|v| v.as_object()) {
            for (name, sub_schema) in props.iter() {
                merged_props.insert(name.clone(), sub_schema.clone());
            }
        }

        let required_names = variant
            .get("required")
            .and_then(|v| v.as_array())
            .into_iter()
            .flatten()
            .filter_map(|entry| entry.as_str());
        for name in required_names {
            let already_listed = merged_required.iter().any(|seen| seen.as_str() == name);
            if !already_listed {
                merged_required.push(name.to_string());
            }
        }

        let closed = variant.get("additionalProperties").and_then(|v| v.as_bool());
        if closed == Some(false) {
            forbids_additional = true;
        }
    }

    let mut merged = serde_json::Map::new();
    merged.insert("type".to_string(), Value::String("object".to_string()));
    merged.insert("properties".to_string(), Value::Object(merged_props));
    merged.insert(
        "required".to_string(),
        Value::Array(merged_required.into_iter().map(Value::String).collect()),
    );
    if forbids_additional {
        merged.insert("additionalProperties".to_string(), Value::Bool(false));
    }

    self.compile_object_type(&Value::Object(merged), depth)
}
/// Returns the non-terminal for a JSON string, building and caching the
/// string grammar on first use.
///
/// The grammar is a double-quoted run of zero or more bytes drawn from
/// 0x20..=0x7E, excluding `"` (0x22) and `\` (0x5C). It has no productions
/// for escape sequences or bytes outside that range.
fn compile_string_type(&mut self) -> Result<NonTerminalId, JsonSchemaCompileError> {
    if let Some(cached) = self.string_nt {
        return Ok(cached);
    }

    let str_nt = self.grammar.alloc_nt("__string");
    let chars_nt = self.grammar.alloc_nt("__string_chars");
    let char_nt = self.grammar.alloc_nt("__string_char");
    let quote = || Symbol::Terminal(vec![b'"']);

    // string := '"' '"' | '"' chars '"'
    let empty_string = Rule::new(str_nt, vec![quote(), quote()]);
    self.grammar.add_rule(empty_string);
    let filled_string = Rule::new(
        str_nt,
        vec![quote(), Symbol::NonTerminal(chars_nt), quote()],
    );
    self.grammar.add_rule(filled_string);

    // chars := char | char chars
    let single_char = Rule::new(chars_nt, vec![Symbol::NonTerminal(char_nt)]);
    self.grammar.add_rule(single_char);
    let more_chars = Rule::new(
        chars_nt,
        vec![Symbol::NonTerminal(char_nt), Symbol::NonTerminal(chars_nt)],
    );
    self.grammar.add_rule(more_chars);

    // char := one byte of 0x20..=0x7E, skipping 0x22 and 0x5C.
    let allowed_bytes = (0x20u8..=0x21u8)
        .chain(0x23u8..=0x5Bu8)
        .chain(0x5Du8..=0x7Eu8);
    for byte in allowed_bytes {
        let literal = Rule::new(char_nt, vec![Symbol::Terminal(vec![byte])]);
        self.grammar.add_rule(literal);
    }

    self.string_nt = Some(str_nt);
    Ok(str_nt)
}
/// Returns the non-terminal matching one ASCII digit `0`-`9`, creating and
/// caching it on first use.
fn ensure_digit_nt(&mut self) -> NonTerminalId {
    match self.digit_nt {
        Some(existing) => existing,
        None => {
            let fresh = self.grammar.alloc_nt("__digit");
            for ascii in b'0'..=b'9' {
                let literal = Rule::new(fresh, vec![Symbol::Terminal(vec![ascii])]);
                self.grammar.add_rule(literal);
            }
            self.digit_nt = Some(fresh);
            fresh
        }
    }
}
/// Builds a fresh grammar for a JSON integer and returns its non-terminal.
///
/// The grammar follows the JSON number syntax for the integer part: either
/// the single digit `0`, or a non-zero digit followed by any digits, each
/// optionally preceded by `-`. Leading zeros such as `007` are therefore
/// rejected, because strict JSON parsers do not accept them.
///
/// Nothing is cached: every call allocates a new set of non-terminals (apart
/// from the shared single-digit one).
fn compile_integer_type(&mut self) -> Result<NonTerminalId, JsonSchemaCompileError> {
    let digit_nt = self.ensure_digit_nt();

    // digits := digit | digit digits
    let digits_nt = self.grammar.alloc_nt("__digits");
    self.grammar
        .add_rule(Rule::new(digits_nt, vec![Symbol::NonTerminal(digit_nt)]));
    self.grammar.add_rule(Rule::new(
        digits_nt,
        vec![
            Symbol::NonTerminal(digit_nt),
            Symbol::NonTerminal(digits_nt),
        ],
    ));

    // nonzero := '1' | ... | '9'
    let nonzero_nt = self.grammar.alloc_nt("__nonzero_digit");
    for b in b'1'..=b'9' {
        self.grammar
            .add_rule(Rule::new(nonzero_nt, vec![Symbol::Terminal(vec![b])]));
    }

    // unsigned := '0' | nonzero | nonzero digits
    let unsigned_nt = self.grammar.alloc_nt("__unsigned_int");
    self.grammar.add_rule(Rule::new(
        unsigned_nt,
        vec![Symbol::Terminal(vec![b'0'])],
    ));
    self.grammar.add_rule(Rule::new(
        unsigned_nt,
        vec![Symbol::NonTerminal(nonzero_nt)],
    ));
    self.grammar.add_rule(Rule::new(
        unsigned_nt,
        vec![
            Symbol::NonTerminal(nonzero_nt),
            Symbol::NonTerminal(digits_nt),
        ],
    ));

    // integer := unsigned | '-' unsigned
    let int_nt = self.grammar.alloc_nt("__integer");
    self.grammar
        .add_rule(Rule::new(int_nt, vec![Symbol::NonTerminal(unsigned_nt)]));
    self.grammar.add_rule(Rule::new(
        int_nt,
        vec![
            Symbol::Terminal(vec![b'-']),
            Symbol::NonTerminal(unsigned_nt),
        ],
    ));
    Ok(int_nt)
}
/// Builds a grammar for a JSON number and returns its non-terminal.
///
/// A number is an integer (as produced by `compile_integer_type`), optionally
/// followed by `.` and one or more digits. There is no production for an
/// exponent part.
fn compile_number_type(&mut self) -> Result<NonTerminalId, JsonSchemaCompileError> {
    let int_nt = self.compile_integer_type()?;
    let digit_nt = self.ensure_digit_nt();

    // frac_digits := digit | digit frac_digits
    let frac_digits_nt = self.grammar.alloc_nt("__frac_digits");
    let one_digit = Rule::new(frac_digits_nt, vec![Symbol::NonTerminal(digit_nt)]);
    self.grammar.add_rule(one_digit);
    let more_digits = Rule::new(
        frac_digits_nt,
        vec![
            Symbol::NonTerminal(digit_nt),
            Symbol::NonTerminal(frac_digits_nt),
        ],
    );
    self.grammar.add_rule(more_digits);

    // number := integer | integer '.' frac_digits
    let num_nt = self.grammar.alloc_nt("__number");
    let whole_only = Rule::new(num_nt, vec![Symbol::NonTerminal(int_nt)]);
    self.grammar.add_rule(whole_only);
    let with_fraction = Rule::new(
        num_nt,
        vec![
            Symbol::NonTerminal(int_nt),
            Symbol::Terminal(vec![b'.']),
            Symbol::NonTerminal(frac_digits_nt),
        ],
    );
    self.grammar.add_rule(with_fraction);

    Ok(num_nt)
}
/// Allocates a non-terminal that matches the literal `true` or `false`.
fn compile_boolean_type(&mut self) -> NonTerminalId {
    let bool_nt = self.grammar.alloc_nt("__boolean");
    for keyword in ["true", "false"] {
        let literal = Symbol::Terminal(keyword.as_bytes().to_vec());
        self.grammar.add_rule(Rule::new(bool_nt, vec![literal]));
    }
    bool_nt
}
/// Allocates a non-terminal that matches the literal `null`.
fn compile_null_type(&mut self) -> NonTerminalId {
    let null_nt = self.grammar.alloc_nt("__null");
    let literal = Symbol::Terminal("null".as_bytes().to_vec());
    let production = Rule::new(null_nt, vec![literal]);
    self.grammar.add_rule(production);
    null_nt
}
/// Builds the grammar used when a schema places no constraint on the value.
///
/// The alternatives are: a string, `true`/`false`, `null`, an integer, the
/// empty object `{}` and the empty array `[]`. Objects and arrays are stubs,
/// so non-empty containers and fractional numbers are not generated.
///
/// The integer alternative follows JSON number syntax: `0`, or a non-zero
/// digit followed by any digits, optionally preceded by `-`. Leading zeros
/// such as `007` are not generated.
///
/// # Panics
///
/// Panics if `compile_string_type` returns an error.
fn compile_any_value_type(&mut self) -> NonTerminalId {
    let val_nt = self.grammar.alloc_nt("__any_value");

    let str_nt = match self.string_nt {
        Some(cached) => cached,
        None => self.compile_string_type().expect("string type compile"),
    };
    let bool_nt = self.compile_boolean_type();
    let null_nt = self.compile_null_type();
    let digit_nt = self.ensure_digit_nt();

    // digits := digit | digit digits
    let digits_nt = self.grammar.alloc_nt("__any_val_digits");
    self.grammar
        .add_rule(Rule::new(digits_nt, vec![Symbol::NonTerminal(digit_nt)]));
    self.grammar.add_rule(Rule::new(
        digits_nt,
        vec![
            Symbol::NonTerminal(digit_nt),
            Symbol::NonTerminal(digits_nt),
        ],
    ));

    // nonzero := '1' | ... | '9'
    let nonzero_nt = self.grammar.alloc_nt("__any_val_nonzero");
    for b in b'1'..=b'9' {
        self.grammar
            .add_rule(Rule::new(nonzero_nt, vec![Symbol::Terminal(vec![b])]));
    }

    // unsigned := '0' | nonzero | nonzero digits
    let unsigned_nt = self.grammar.alloc_nt("__any_val_uint");
    self.grammar.add_rule(Rule::new(
        unsigned_nt,
        vec![Symbol::Terminal(vec![b'0'])],
    ));
    self.grammar.add_rule(Rule::new(
        unsigned_nt,
        vec![Symbol::NonTerminal(nonzero_nt)],
    ));
    self.grammar.add_rule(Rule::new(
        unsigned_nt,
        vec![
            Symbol::NonTerminal(nonzero_nt),
            Symbol::NonTerminal(digits_nt),
        ],
    ));

    // num := unsigned | '-' unsigned
    let num_nt = self.grammar.alloc_nt("__any_val_num");
    self.grammar
        .add_rule(Rule::new(num_nt, vec![Symbol::NonTerminal(unsigned_nt)]));
    self.grammar.add_rule(Rule::new(
        num_nt,
        vec![
            Symbol::Terminal(vec![b'-']),
            Symbol::NonTerminal(unsigned_nt),
        ],
    ));

    // Container stubs: only the empty object and the empty array.
    let obj_stub_nt = self.grammar.alloc_nt("__any_val_obj");
    self.grammar.add_rule(Rule::new(
        obj_stub_nt,
        vec![Symbol::Terminal(vec![b'{']), Symbol::Terminal(vec![b'}'])],
    ));
    let arr_stub_nt = self.grammar.alloc_nt("__any_val_arr");
    self.grammar.add_rule(Rule::new(
        arr_stub_nt,
        vec![Symbol::Terminal(vec![b'[']), Symbol::Terminal(vec![b']'])],
    ));

    for alternative in [str_nt, bool_nt, null_nt, num_nt, obj_stub_nt, arr_stub_nt] {
        self.grammar
            .add_rule(Rule::new(val_nt, vec![Symbol::NonTerminal(alternative)]));
    }
    val_nt
}
fn compile_object_type(
&mut self,
schema: &Value,
depth: usize,
) -> Result<NonTerminalId, JsonSchemaCompileError> {
if let Some(ap) = schema.get("additionalProperties") {
if !ap.is_boolean() {
return Err(JsonSchemaCompileError::UnsupportedKeyword(
"additionalProperties as a subschema is not supported; use false or omit it"
.to_string(),
));
}
}
let properties = schema
.get("properties")
.and_then(|v| v.as_object())
.cloned()
.unwrap_or_default();
let required: Vec<String> = schema
.get("required")
.and_then(|v| v.as_array())
.map(|arr| {
arr.iter()
.filter_map(|v| v.as_str().map(|s| s.to_string()))
.collect()
})
.unwrap_or_default();
let obj_nt = self.grammar.alloc_nt("__object");
if required.is_empty() {
self.grammar.add_rule(Rule::new(
obj_nt,
vec![Symbol::Terminal(vec![b'{']), Symbol::Terminal(vec![b'}'])],
));
return Ok(obj_nt);
}
let mut prop_nts: Vec<(String, NonTerminalId)> = Vec::new();
for prop_name in &required {
let sub_schema = properties.get(prop_name).ok_or_else(|| {
JsonSchemaCompileError::InvalidSchema(format!(
"required property '{prop_name}' not found in 'properties'"
))
})?;
let val_nt = self.compile_schema(sub_schema, depth + 1)?;
prop_nts.push((prop_name.clone(), val_nt));
}
let mut body: Vec<Symbol> = Vec::new();
body.push(Symbol::Terminal(vec![b'{']));
for (i, (prop_name, val_nt)) in prop_nts.iter().enumerate() {
if i > 0 {
body.push(Symbol::Terminal(vec![b',']));
}
let key_bytes = json_string_literal_bytes(prop_name);
body.push(Symbol::Terminal(key_bytes));
body.push(Symbol::Terminal(vec![b':']));
body.push(Symbol::NonTerminal(*val_nt));
}
body.push(Symbol::Terminal(vec![b'}']));
self.grammar.add_rule(Rule::new(obj_nt, body));
Ok(obj_nt)
}
/// Compiles an array schema: `[]`, or a comma-separated, non-empty list of
/// items wrapped in brackets.
///
/// Items follow the `items` sub-schema when present (compiled one level
/// deeper) and the unconstrained-value grammar otherwise.
///
/// # Errors
///
/// Propagates errors from compiling the `items` sub-schema.
fn compile_array_type(
    &mut self,
    schema: &Value,
    depth: usize,
) -> Result<NonTerminalId, JsonSchemaCompileError> {
    let item_nt = match schema.get("items") {
        Some(items) => self.compile_schema(items, depth + 1)?,
        None => self.compile_any_value_type(),
    };

    // items := item | item ',' items
    let items_nt = self.grammar.alloc_nt("__array_items");
    let single_item = Rule::new(items_nt, vec![Symbol::NonTerminal(item_nt)]);
    self.grammar.add_rule(single_item);
    let item_then_rest = Rule::new(
        items_nt,
        vec![
            Symbol::NonTerminal(item_nt),
            Symbol::Terminal(vec![b',']),
            Symbol::NonTerminal(items_nt),
        ],
    );
    self.grammar.add_rule(item_then_rest);

    // array := '[' ']' | '[' items ']'
    let arr_nt = self.grammar.alloc_nt("__array");
    let empty_array = Rule::new(
        arr_nt,
        vec![Symbol::Terminal(vec![b'[']), Symbol::Terminal(vec![b']'])],
    );
    self.grammar.add_rule(empty_array);
    let filled_array = Rule::new(
        arr_nt,
        vec![
            Symbol::Terminal(vec![b'[']),
            Symbol::NonTerminal(items_nt),
            Symbol::Terminal(vec![b']']),
        ],
    );
    self.grammar.add_rule(filled_array);

    Ok(arr_nt)
}
}
/// Serialises a scalar JSON value to the exact bytes it should match.
///
/// Strings are rendered with `serde_json::to_string` (quotes included),
/// integers with their decimal text, and booleans and `null` as keywords.
///
/// # Errors
///
/// Returns `UnsupportedKeyword` for non-integer numbers and for arrays or
/// objects, and `InvalidSchema` if string serialisation fails.
fn json_value_to_literal(v: &Value) -> Result<Vec<u8>, JsonSchemaCompileError> {
    let bytes = match v {
        Value::Null => b"null".to_vec(),
        Value::Bool(true) => b"true".to_vec(),
        Value::Bool(false) => b"false".to_vec(),
        Value::Number(n) if n.is_i64() || n.is_u64() => n.to_string().into_bytes(),
        Value::Number(_) => {
            return Err(JsonSchemaCompileError::UnsupportedKeyword(
                "float enum values are not supported".to_string(),
            ));
        }
        Value::String(_) => serde_json::to_string(v)
            .map_err(|e| JsonSchemaCompileError::InvalidSchema(e.to_string()))?
            .into_bytes(),
        _ => {
            return Err(JsonSchemaCompileError::UnsupportedKeyword(
                "enum values must be strings, integers, booleans, or null".to_string(),
            ));
        }
    };
    Ok(bytes)
}
/// Returns `key` rendered as a JSON string literal (quoted and escaped by
/// `serde_json::to_string`), as bytes.
///
/// # Panics
///
/// Panics if `serde_json::to_string` returns an error for `key`.
fn json_string_literal_bytes(key: &str) -> Vec<u8> {
    let quoted = serde_json::to_string(key).expect("key serialization must succeed");
    quoted.into_bytes()
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A bare string schema compiles to a grammar with at least one rule.
    #[test]
    fn compile_string_gives_grammar() {
        let schema = r#"{"type":"string"}"#;
        let grammar = compile_json_schema_str(schema).expect("should compile");
        assert!(!grammar.rules.is_empty());
    }

    /// A bare boolean schema compiles to a grammar with at least one rule.
    #[test]
    fn compile_boolean_gives_grammar() {
        let schema = r#"{"type":"boolean"}"#;
        let grammar = compile_json_schema_str(schema).expect("should compile");
        assert!(!grammar.rules.is_empty());
    }

    /// Strings are rendered with their surrounding quotes.
    #[test]
    fn json_value_to_literal_string() {
        let input = Value::String(String::from("hello"));
        let encoded = json_value_to_literal(&input).expect("ok");
        assert_eq!(encoded, br#""hello""#.to_vec());
    }

    /// Integers are rendered as plain decimal text.
    #[test]
    fn json_value_to_literal_integer() {
        let input = Value::Number(serde_json::Number::from(42));
        let encoded = json_value_to_literal(&input).expect("ok");
        assert_eq!(encoded, b"42".to_vec());
    }

    /// `true` is rendered as the bare keyword.
    #[test]
    fn json_value_to_literal_bool() {
        let encoded = json_value_to_literal(&Value::Bool(true)).expect("ok");
        assert_eq!(encoded, b"true".to_vec());
    }

    /// `null` is rendered as the bare keyword.
    #[test]
    fn json_value_to_literal_null() {
        let encoded = json_value_to_literal(&Value::Null).expect("ok");
        assert_eq!(encoded, b"null".to_vec());
    }

    /// Property keys are rendered as quoted JSON strings.
    #[test]
    fn json_string_literal_bytes_simple() {
        let encoded = json_string_literal_bytes("name");
        assert_eq!(encoded, br#""name""#.to_vec());
    }
}