Skip to main content

hedl_json/
schema_gen.rs

1// Dweve HEDL - Hierarchical Entity Data Language
2//
3// Copyright (c) 2025 Dweve IP B.V. and individual contributors.
4//
5// SPDX-License-Identifier: Apache-2.0
6//
7// Licensed under the Apache License, Version 2.0 (the "License");
8// you may not use this file except in compliance with the License.
9// You may obtain a copy of the License in the LICENSE file at the
10// root of this repository or at: http://www.apache.org/licenses/LICENSE-2.0
11//
12// Unless required by applicable law or agreed to in writing, software
13// distributed under the License is distributed on an "AS IS" BASIS,
14// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15// See the License for the specific language governing permissions and
16// limitations under the License.
17
18//! JSON Schema (Draft 7) generation from HEDL documents.
19//!
20//! This module provides comprehensive JSON Schema generation with support for:
21//!
22//! - **All HEDL Types**: Scalars, tensors, references, expressions, structs
23//! - **Type Inference**: Smart format detection (email, URI, date-time)
24//! - **%NEST:Relationships**: Hierarchical structures with nested arrays
25//! - **Schema Validation**: Validates generated schemas for correctness
26//! - **Configuration**: Title, description, strict mode, examples
27//!
28//! # Examples
29//!
30//! ## Basic Schema Generation
31//!
32//! ```rust
33//! use hedl_core::parse;
34//! use hedl_json::schema_gen::{generate_schema, SchemaConfig};
35//!
36//! # fn example() -> Result<(), Box<dyn std::error::Error>> {
37//! let hedl = r#"
38//! name: Alice
39//! age: 30
40//! active: true
41//! "#;
42//!
43//! let doc = parse(hedl.as_bytes())?;
44//! let schema = generate_schema(&doc, &SchemaConfig::default())?;
45//! println!("{}", schema);
46//! # Ok(())
47//! # }
48//! ```
49//!
50//! ## Schema with %STRUCT:Definitions
51//!
52//! ```rust
53//! use hedl_core::parse;
54//! use hedl_json::schema_gen::{generate_schema, SchemaConfig};
55//!
56//! # fn example() -> Result<(), Box<dyn std::error::Error>> {
57//! let hedl = r#"
58//! %STRUCT: User: [id, name, email]
59//! users:@User
60//!   u1, Alice, alice@example.com
61//! "#;
62//!
63//! let doc = parse(hedl.as_bytes())?;
64//! let config = SchemaConfig::builder()
65//!     .title("User API Schema")
66//!     .strict(true)
67//!     .build();
68//! let schema = generate_schema(&doc, &config)?;
69//! # Ok(())
70//! # }
71//! ```
72//!
73//! ## Schema with %NEST:Relationships
74//!
75//! ```rust
76//! use hedl_core::parse;
77//! use hedl_json::schema_gen::{generate_schema, SchemaConfig};
78//!
79//! # fn example() -> Result<(), Box<dyn std::error::Error>> {
80//! let hedl = r#"
81//! %STRUCT: Team: [id, name]
82//! %STRUCT: Member: [id, name]
83//! %NEST: Team > Member
84//!
85//! teams:@Team
86//!   t1, Engineering
87//! "#;
88//!
89//! let doc = parse(hedl.as_bytes())?;
90//! let schema = generate_schema(&doc, &SchemaConfig::default())?;
91//! # Ok(())
92//! # }
93//! ```
94
95use hedl_core::lex::Tensor;
96use hedl_core::{Document, Item, MatrixList, Value};
97use serde_json::{json, Map, Value as JsonValue};
98use std::collections::BTreeMap;
99use thiserror::Error;
100
101/// Errors that can occur during schema generation
102#[derive(Error, Debug)]
103pub enum SchemaError {
104    /// Schema validation failed
105    #[error("Schema validation failed: {0}")]
106    ValidationError(String),
107
108    /// Invalid configuration
109    #[error("Invalid configuration: {0}")]
110    ConfigError(String),
111
112    /// Serialization error
113    #[error("Serialization error: {0}")]
114    SerializationError(#[from] serde_json::Error),
115
116    /// Internal error
117    #[error("Internal error: {0}")]
118    InternalError(String),
119}
120
/// Options controlling how a JSON Schema is generated from a HEDL document.
///
/// All fields are public, so a value can be written as a struct literal,
/// obtained from [`Default`], or assembled through the builder.
///
/// The type is comparable (`PartialEq`/`Eq`) so that callers and tests can
/// check two configurations for equality directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SchemaConfig {
    /// Value for the schema's `title` keyword, if any.
    pub title: Option<String>,
    /// Value for the schema's `description` keyword, if any.
    pub description: Option<String>,
    /// Value for the schema's `$id` keyword, if any.
    pub schema_id: Option<String>,
    /// Strict mode: emit `additionalProperties: false` on generated objects
    /// (default: `false`).
    pub strict: bool,
    /// Attach the observed value as an `examples` entry on scalar schemas
    /// (default: `false`).
    pub include_examples: bool,
    /// Emit the metadata keywords (`title`, `description`, `$id`) when they
    /// are set (default: `true`).
    pub include_metadata: bool,
}
137
138impl Default for SchemaConfig {
139    fn default() -> Self {
140        Self {
141            title: None,
142            description: None,
143            schema_id: None,
144            strict: false,
145            include_examples: false,
146            include_metadata: true,
147        }
148    }
149}
150
151impl SchemaConfig {
152    /// Create a new builder for `SchemaConfig`
153    #[must_use]
154    pub fn builder() -> SchemaConfigBuilder {
155        SchemaConfigBuilder::default()
156    }
157}
158
/// Step-by-step builder for `SchemaConfig`.
///
/// Each field mirrors the `SchemaConfig` field of the same name; `build`
/// moves them into the finished configuration. The builder is `Clone` so a
/// partially configured builder can be reused as a template for several
/// configurations.
#[derive(Debug, Clone)]
pub struct SchemaConfigBuilder {
    /// Pending value for `SchemaConfig::title`.
    title: Option<String>,
    /// Pending value for `SchemaConfig::description`.
    description: Option<String>,
    /// Pending value for `SchemaConfig::schema_id`.
    schema_id: Option<String>,
    /// Pending value for `SchemaConfig::strict`.
    strict: bool,
    /// Pending value for `SchemaConfig::include_examples`.
    include_examples: bool,
    /// Pending value for `SchemaConfig::include_metadata`.
    include_metadata: bool,
}
169
170impl Default for SchemaConfigBuilder {
171    fn default() -> Self {
172        Self {
173            title: None,
174            description: None,
175            schema_id: None,
176            strict: false,
177            include_examples: false,
178            include_metadata: true, // Default to true to match SchemaConfig
179        }
180    }
181}
182
183impl SchemaConfigBuilder {
184    /// Set the schema title
185    pub fn title(mut self, title: impl Into<String>) -> Self {
186        self.title = Some(title.into());
187        self
188    }
189
190    /// Set the schema description
191    pub fn description(mut self, description: impl Into<String>) -> Self {
192        self.description = Some(description.into());
193        self
194    }
195
196    /// Set the schema $id URI
197    pub fn schema_id(mut self, schema_id: impl Into<String>) -> Self {
198        self.schema_id = Some(schema_id.into());
199        self
200    }
201
202    /// Enable strict mode (disallow additional properties)
203    #[must_use]
204    pub fn strict(mut self, strict: bool) -> Self {
205        self.strict = strict;
206        self
207    }
208
209    /// Include example values in schema
210    #[must_use]
211    pub fn include_examples(mut self, include: bool) -> Self {
212        self.include_examples = include;
213        self
214    }
215
216    /// Include metadata fields (title, description, $id)
217    #[must_use]
218    pub fn include_metadata(mut self, include: bool) -> Self {
219        self.include_metadata = include;
220        self
221    }
222
223    /// Build the `SchemaConfig`
224    #[must_use]
225    pub fn build(self) -> SchemaConfig {
226        SchemaConfig {
227            title: self.title,
228            description: self.description,
229            schema_id: self.schema_id,
230            strict: self.strict,
231            include_examples: self.include_examples,
232            include_metadata: self.include_metadata,
233        }
234    }
235}
236
237/// Generate JSON Schema (Draft 7) from HEDL document as a JSON string
238///
239/// # Arguments
240///
241/// * `doc` - The HEDL document to convert
242/// * `config` - Schema generation configuration
243///
244/// # Returns
245///
246/// A pretty-printed JSON Schema string
247///
248/// # Errors
249///
250/// Returns error if schema generation or serialization fails
251pub fn generate_schema(doc: &Document, config: &SchemaConfig) -> Result<String, SchemaError> {
252    let schema = generate_schema_value(doc, config)?;
253    Ok(serde_json::to_string_pretty(&schema)?)
254}
255
256/// Generate JSON Schema (Draft 7) from HEDL document as a `JsonValue`
257///
258/// # Arguments
259///
260/// * `doc` - The HEDL document to convert
261/// * `config` - Schema generation configuration
262///
263/// # Returns
264///
265/// A `JsonValue` representing the JSON Schema
266///
267/// # Errors
268///
269/// Returns error if schema generation fails
270pub fn generate_schema_value(
271    doc: &Document,
272    config: &SchemaConfig,
273) -> Result<JsonValue, SchemaError> {
274    let mut schema = Map::with_capacity(8);
275
276    // Required: $schema field
277    schema.insert(
278        "$schema".to_string(),
279        json!("http://json-schema.org/draft-07/schema#"),
280    );
281
282    // Optional metadata fields
283    if config.include_metadata {
284        if let Some(ref title) = config.title {
285            schema.insert("title".to_string(), json!(title));
286        }
287        if let Some(ref description) = config.description {
288            schema.insert("description".to_string(), json!(description));
289        }
290        if let Some(ref schema_id) = config.schema_id {
291            schema.insert("$id".to_string(), json!(schema_id));
292        }
293    }
294
295    // Root type is always object
296    schema.insert("type".to_string(), json!("object"));
297
298    // Generate definitions from %STRUCT:declarations
299    if !doc.structs.is_empty() {
300        let definitions = generate_definitions(doc, config);
301        schema.insert("definitions".to_string(), JsonValue::Object(definitions));
302    }
303
304    // Generate properties from root items
305    let properties = generate_properties(&doc.root, doc, config);
306    schema.insert("properties".to_string(), JsonValue::Object(properties));
307
308    // Strict mode: no additional properties
309    if config.strict {
310        schema.insert("additionalProperties".to_string(), json!(false));
311    }
312
313    Ok(JsonValue::Object(schema))
314}
315
316/// Generate schema definitions from %STRUCT:declarations
317fn generate_definitions(doc: &Document, config: &SchemaConfig) -> Map<String, JsonValue> {
318    let mut definitions = Map::with_capacity(doc.structs.len());
319
320    for (type_name, schema_fields) in &doc.structs {
321        let mut def = Map::with_capacity(4);
322        def.insert("type".to_string(), json!("object"));
323
324        // Generate properties for struct fields
325        let mut properties = Map::with_capacity(schema_fields.len());
326
327        for field_name in schema_fields {
328            // Infer type from actual data if available
329            let field_schema = infer_field_type(type_name, field_name, doc, config);
330            properties.insert(field_name.clone(), field_schema);
331        }
332
333        // Add nested children if %NEST:relationship exists
334        if let Some(child_types) = doc.nests.get(type_name) {
335            for child_type in child_types {
336                let child_array_name = pluralize(child_type);
337                let child_ref = json!({
338                    "type": "array",
339                    "items": {
340                        "$ref": format!("#/definitions/{}", child_type)
341                    }
342                });
343                properties.insert(child_array_name, child_ref);
344            }
345        }
346
347        def.insert("properties".to_string(), JsonValue::Object(properties));
348
349        // Required fields: only "id" is required (first column)
350        if !schema_fields.is_empty() {
351            def.insert("required".to_string(), json!([schema_fields[0]]));
352        }
353
354        // Strict mode: no additional properties
355        if config.strict {
356            def.insert("additionalProperties".to_string(), json!(false));
357        }
358
359        definitions.insert(type_name.clone(), JsonValue::Object(def));
360    }
361
362    definitions
363}
364
365/// Generate properties from root items
366fn generate_properties(
367    items: &BTreeMap<String, Item>,
368    doc: &Document,
369    config: &SchemaConfig,
370) -> Map<String, JsonValue> {
371    let mut properties = Map::with_capacity(items.len());
372
373    for (key, item) in items {
374        let prop_schema = item_to_schema(item, doc, config);
375        properties.insert(key.clone(), prop_schema);
376    }
377
378    properties
379}
380
381/// Convert an Item to a JSON Schema property
382fn item_to_schema(item: &Item, doc: &Document, config: &SchemaConfig) -> JsonValue {
383    match item {
384        Item::Scalar(value) => value_to_schema(value, None, config),
385        Item::Object(obj) => object_to_schema(obj, doc, config),
386        Item::List(list) => matrix_list_to_schema(list, config),
387    }
388}
389
390/// Convert a Value to a JSON Schema type
391fn value_to_schema(value: &Value, field_name: Option<&str>, config: &SchemaConfig) -> JsonValue {
392    let mut schema = Map::with_capacity(4);
393
394    match value {
395        Value::Null => {
396            schema.insert("type".to_string(), json!("null"));
397        }
398        Value::Bool(b) => {
399            schema.insert("type".to_string(), json!("boolean"));
400            if config.include_examples {
401                schema.insert("examples".to_string(), json!([b]));
402            }
403        }
404        Value::Int(n) => {
405            schema.insert("type".to_string(), json!("integer"));
406            if config.include_examples {
407                schema.insert("examples".to_string(), json!([n]));
408            }
409        }
410        Value::Float(f) => {
411            schema.insert("type".to_string(), json!("number"));
412            if config.include_examples {
413                schema.insert("examples".to_string(), json!([f]));
414            }
415        }
416        Value::String(s) => {
417            schema.insert("type".to_string(), json!("string"));
418
419            // Smart format detection
420            if let Some(format) = infer_string_format(s, field_name) {
421                schema.insert("format".to_string(), json!(format));
422            }
423
424            if config.include_examples {
425                schema.insert("examples".to_string(), json!([s]));
426            }
427        }
428        Value::Tensor(tensor) => {
429            // Tensor schema depends on shape
430            return tensor_to_schema(tensor, config);
431        }
432        Value::Reference(reference) => {
433            schema.insert("type".to_string(), json!("string"));
434            schema.insert(
435                "pattern".to_string(),
436                json!("^@([A-Z][a-zA-Z0-9]*:)?[a-zA-Z0-9_-]+$"),
437            );
438            schema.insert(
439                "description".to_string(),
440                json!(format!(
441                    "Reference to {}",
442                    reference.type_name.as_deref().unwrap_or("entity")
443                )),
444            );
445        }
446        Value::Expression(_) => {
447            schema.insert("type".to_string(), json!("string"));
448            schema.insert("pattern".to_string(), json!(r"^\$\(.+\)$"));
449            schema.insert("description".to_string(), json!("HEDL expression $(...)"));
450        }
451        Value::List(values) => {
452            schema.insert("type".to_string(), json!("array"));
453
454            // Try to infer item schema from first element
455            if let Some(first) = values.first() {
456                let item_schema = value_to_schema(first, None, config);
457                schema.insert("items".to_string(), item_schema);
458            }
459
460            // Note: examples omitted for List since Value doesn't implement Serialize
461            // This would require converting to JSON representation first
462        }
463    }
464
465    JsonValue::Object(schema)
466}
467
468/// Convert an object to a JSON Schema
469fn object_to_schema(
470    obj: &BTreeMap<String, Item>,
471    doc: &Document,
472    config: &SchemaConfig,
473) -> JsonValue {
474    let mut schema = Map::with_capacity(3);
475    schema.insert("type".to_string(), json!("object"));
476
477    let properties = generate_properties(obj, doc, config);
478    schema.insert("properties".to_string(), JsonValue::Object(properties));
479
480    if config.strict {
481        schema.insert("additionalProperties".to_string(), json!(false));
482    }
483
484    JsonValue::Object(schema)
485}
486
487/// Convert a matrix list to a JSON Schema
488fn matrix_list_to_schema(list: &MatrixList, _config: &SchemaConfig) -> JsonValue {
489    let mut schema = Map::with_capacity(2);
490    schema.insert("type".to_string(), json!("array"));
491
492    // Reference to the struct definition
493    let items = json!({
494        "$ref": format!("#/definitions/{}", list.type_name)
495    });
496    schema.insert("items".to_string(), items);
497
498    JsonValue::Object(schema)
499}
500
501/// Convert a tensor to a JSON Schema
502fn tensor_to_schema(tensor: &Tensor, config: &SchemaConfig) -> JsonValue {
503    match tensor {
504        Tensor::Scalar(val) => {
505            let mut schema = Map::with_capacity(2);
506            schema.insert("type".to_string(), json!("number"));
507            if config.include_examples {
508                schema.insert("examples".to_string(), json!([val]));
509            }
510            JsonValue::Object(schema)
511        }
512        Tensor::Array(_) => {
513            // Multi-dimensional array
514            json!({
515                "type": "array",
516                "items": {
517                    "oneOf": [
518                        {"type": "number"},
519                        {"type": "array"}
520                    ]
521                }
522            })
523        }
524    }
525}
526
527/// Infer field type from actual data in the document
528fn infer_field_type(
529    type_name: &str,
530    field_name: &str,
531    doc: &Document,
532    config: &SchemaConfig,
533) -> JsonValue {
534    // Find the first instance of this type in the document
535    for item in doc.root.values() {
536        if let Item::List(list) = item {
537            if list.type_name == type_name && !list.rows.is_empty() {
538                // Find the field index
539                if let Some(field_idx) = list.schema.iter().position(|f| f == field_name) {
540                    // Get the first row's value for this field
541                    if let Some(node) = list.rows.first() {
542                        if let Some(value) = node.fields.get(field_idx) {
543                            return value_to_schema(value, Some(field_name), config);
544                        }
545                    }
546                }
547            }
548        }
549    }
550
551    // Default fallback: string type with format hints
552    let mut schema = Map::with_capacity(2);
553    schema.insert("type".to_string(), json!("string"));
554
555    // Smart format detection based on field name
556    if let Some(format) = infer_format_from_name(field_name) {
557        schema.insert("format".to_string(), json!(format));
558    }
559
560    JsonValue::Object(schema)
561}
562
563/// Infer JSON Schema format from string value
564fn infer_string_format(s: &str, field_name: Option<&str>) -> Option<&'static str> {
565    // Email detection
566    if s.contains('@') && s.contains('.') && !s.starts_with('@') {
567        return Some("email");
568    }
569
570    // URI detection
571    if s.starts_with("http://") || s.starts_with("https://") || s.starts_with("ftp://") {
572        return Some("uri");
573    }
574
575    // ISO 8601 date-time detection
576    if s.contains('T') && (s.contains('Z') || s.contains('+') || s.contains('-')) && s.len() >= 19 {
577        // Minimum length for ISO 8601
578        return Some("date-time");
579    }
580
581    // UUID detection
582    if s.len() == 36 && s.chars().filter(|&c| c == '-').count() == 4 {
583        return Some("uuid");
584    }
585
586    // Field name-based inference as fallback
587    infer_format_from_name(field_name?)
588}
589
/// Guess a JSON Schema `format` from a field name alone.
///
/// Matching is case-insensitive and substring-based. The first rule that
/// applies wins:
///
/// 1. contains `email` -> `email`
/// 2. contains `url` or `uri` -> `uri`
/// 3. contains `date`, or ends with `_at` / `_on` -> `date-time`
/// 4. contains `uuid` or `guid` -> `uuid`
///
/// Returns `None` when no rule applies.
fn infer_format_from_name(field_name: &str) -> Option<&'static str> {
    let name = field_name.to_lowercase();
    let has = |needle: &str| name.contains(needle);
    let ends = |suffix: &str| name.ends_with(suffix);

    if has("email") {
        return Some("email");
    }
    if has("url") || has("uri") {
        return Some("uri");
    }
    if has("date") || ends("_at") || ends("_on") {
        return Some("date-time");
    }
    if has("uuid") || has("guid") {
        return Some("uuid");
    }
    None
}
606
/// Pluralize a type name using simple English rules.
///
/// * ends in `s`, `x`, `z`, `ch` or `sh` -> append `es` (`Box` -> `Boxes`)
/// * ends in a consonant followed by `y` -> replace `y` with `ies`
///   (`Category` -> `Categories`)
/// * anything else, including vowel + `y` -> append `s`
///   (`Key` -> `Keys`, `Toy` -> `Toys`)
///
/// The vowel test covers all of `a e i o u`, so `Toy` and `Guy` no longer
/// become `Toies` and `Guies`. Irregular plurals are not handled.
///
/// NOTE(review): the result names the nested-array property in generated
/// definitions; if another module derives the same key with its own
/// pluralization, keep the two in sync.
fn pluralize(word: &str) -> String {
    const ES_ENDINGS: [&str; 5] = ["s", "x", "z", "ch", "sh"];

    if ES_ENDINGS.iter().any(|ending| word.ends_with(*ending)) {
        return format!("{word}es");
    }

    if let Some(stem) = word.strip_suffix('y') {
        let follows_vowel = stem.chars().next_back().map_or(false, |c| {
            matches!(c.to_ascii_lowercase(), 'a' | 'e' | 'i' | 'o' | 'u')
        });
        if !follows_vowel {
            return format!("{stem}ies");
        }
    }

    format!("{word}s")
}
622
623/// Validate a JSON Schema for correctness
624///
625/// Validates that the schema:
626/// - Has required `$schema` field (root level only)
627/// - Has required `type` field
628/// - Type is a valid JSON Schema type
629/// - References are well-formed
630///
631/// # Arguments
632///
633/// * `schema` - The JSON Schema to validate
634///
635/// # Returns
636///
637/// Ok(()) if valid, Err with validation message otherwise
638pub fn validate_schema(schema: &JsonValue) -> Result<(), SchemaError> {
639    validate_schema_internal(schema, true)
640}
641
642/// Internal validation function with control over $schema field requirement
643fn validate_schema_internal(
644    schema: &JsonValue,
645    require_schema_field: bool,
646) -> Result<(), SchemaError> {
647    let obj = schema
648        .as_object()
649        .ok_or_else(|| SchemaError::ValidationError("Schema must be an object".to_string()))?;
650
651    // Validate $schema field (only for root schema)
652    if require_schema_field && !obj.contains_key("$schema") {
653        return Err(SchemaError::ValidationError(
654            "Schema must have $schema field".to_string(),
655        ));
656    }
657
658    // Validate type field
659    if !obj.contains_key("type") {
660        return Err(SchemaError::ValidationError(
661            "Schema must have type field".to_string(),
662        ));
663    }
664
665    // Validate type value
666    let schema_type = obj
667        .get("type")
668        .and_then(|v| v.as_str())
669        .ok_or_else(|| SchemaError::ValidationError("type must be a string".to_string()))?;
670
671    let valid_types = [
672        "null", "boolean", "object", "array", "number", "string", "integer",
673    ];
674    if !valid_types.contains(&schema_type) {
675        return Err(SchemaError::ValidationError(format!(
676            "Invalid type: {schema_type}. Must be one of: {valid_types:?}"
677        )));
678    }
679
680    // Recursively validate definitions (without requiring $schema field)
681    if let Some(definitions) = obj.get("definitions") {
682        if let Some(defs) = definitions.as_object() {
683            for (name, def_schema) in defs {
684                validate_schema_internal(def_schema, false).map_err(|e| {
685                    SchemaError::ValidationError(format!("Invalid definition '{name}': {e}"))
686                })?;
687            }
688        }
689    }
690
691    Ok(())
692}
693
#[cfg(test)]
mod tests {
    use super::*;

    /// Regular, `-ies` and `-es` plurals.
    #[test]
    fn test_pluralize() {
        let cases = [
            ("User", "Users"),
            ("Post", "Posts"),
            ("Category", "Categories"),
            ("Box", "Boxes"),
            ("Class", "Classes"),
        ];
        for (singular, plural) in cases {
            assert_eq!(pluralize(singular), plural);
        }
    }

    /// An address-shaped value is tagged `email` without a field-name hint.
    #[test]
    fn test_infer_string_format_email() {
        let detected = infer_string_format("alice@example.com", None);
        assert_eq!(detected, Some("email"));
    }

    /// An `https://` value is tagged `uri` without a field-name hint.
    #[test]
    fn test_infer_string_format_uri() {
        let detected = infer_string_format("https://example.com", None);
        assert_eq!(detected, Some("uri"));
    }

    /// An ISO 8601 timestamp is tagged `date-time` without a field-name hint.
    #[test]
    fn test_infer_string_format_datetime() {
        let detected = infer_string_format("2024-01-01T00:00:00Z", None);
        assert_eq!(detected, Some("date-time"));
    }

    /// Each name-based rule is exercised once.
    #[test]
    fn test_infer_format_from_name() {
        let cases = [
            ("email", "email"),
            ("url", "uri"),
            ("created_at", "date-time"),
            ("uuid", "uuid"),
        ];
        for (field_name, format) in cases {
            assert_eq!(infer_format_from_name(field_name), Some(format));
        }
    }

    /// Values set on the builder arrive in the built configuration.
    #[test]
    fn test_config_builder() {
        let config = SchemaConfig::builder()
            .title("Test")
            .description("Desc")
            .strict(true)
            .build();

        assert_eq!(config.title.as_deref(), Some("Test"));
        assert_eq!(config.description.as_deref(), Some("Desc"));
        assert!(config.strict);
    }

    /// The default configuration is non-strict, example-free, metadata-on.
    #[test]
    fn test_default_config() {
        let config = SchemaConfig::default();
        assert!(config.title.is_none());
        assert!(!config.strict);
        assert!(!config.include_examples);
        assert!(config.include_metadata);
    }
}